diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5669bd8981d88d1e36835ffcbd7526af8f983861
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95186df5b3792449390b3cdfc58ce0d74b26c09e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/_version.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73ee8d0959379988180c886f09b6361efa56ff39
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/archive.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..822ed4403b350676bbdd1f6e1e4762d26ee0d709
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/asyn.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f1d7348a488cadd338fbf1a9c328264532056cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/caching.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a950ee23a87d928ca1f5baf038e85e47702961df
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/callbacks.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0f1e50604422b0d9a81fe3e64a945f9142452ff
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/compression.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6d310777410b081a89f9a115e2fd20e5d8d46cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/config.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..867792a1f1467557441b0fe26cd1f07458203f21
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/conftest.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b925283cb545e2d57135995b0de76cb05bfe7a2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/dircache.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f05480658a3fd5c31d09032fcbc3595efb88e08d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/exceptions.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9730ccc4546c51521088b4eecb4bd540b37bd820
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/fuse.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d73577acb483a732916e7dc8c1a1e9bfeefaca6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/generic.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6cd2c0173d810137bf8fd93be662bd17e2b48a35
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/gui.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06256a50e92559ecccec367e932236eebe7b7c45
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/mapping.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89707ba3e0f24e6acaa247dcf1951101ef8ce6dd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/parquet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a02bd58909c197344263f11e8bd459bee5ea8da3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/registry.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ca02ac00841a2ae9c5f13b670384643c7074428
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/spec.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04c67f829d305f6e67efe97c2fe4febe14f88741
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/transaction.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c98399c08a2fe92a230f85e8cdff536bb0422c3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9fea70d25fe2974e19d35186eee5de60a008eb4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/arrow.py
@@ -0,0 +1,304 @@
+import errno
+import io
+import os
+import secrets
+import shutil
+from contextlib import suppress
+from functools import cached_property, wraps
+from urllib.parse import parse_qs
+
+from fsspec.spec import AbstractFileSystem
+from fsspec.utils import (
+    get_package_version_without_import,
+    infer_storage_options,
+    mirror_from,
+    tokenize,
+)
+
+
+def wrap_exceptions(func):
+    """Decorator turning "does not exist" OSErrors into FileNotFoundError."""
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except OSError as exception:
+            # Translate only when the first exception argument is a string
+            # containing "does not exist"; anything else propagates as-is.
+            text = exception.args[0] if exception.args else None
+            if isinstance(text, str) and "does not exist" in text:
+                raise FileNotFoundError(errno.ENOENT, text) from exception
+            raise
+
+    return wrapper
+
+
+PYARROW_VERSION = None
+
+
+class ArrowFSWrapper(AbstractFileSystem):
+    """FSSpec-compatible wrapper of pyarrow.fs.FileSystem.
+
+    Parameters
+    ----------
+    fs : pyarrow.fs.FileSystem
+
+    """
+
+    root_marker = "/"
+
+    def __init__(self, fs, **kwargs):
+        global PYARROW_VERSION
+        PYARROW_VERSION = get_package_version_without_import("pyarrow")
+        self.fs = fs
+        super().__init__(**kwargs)
+
+    @property
+    def protocol(self):
+        return self.fs.type_name
+
+    @cached_property
+    def fsid(self):
+        return "hdfs_" + tokenize(self.fs.host, self.fs.port)
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        ops = infer_storage_options(path)
+        path = ops["path"]
+        if path.startswith("//"):
+            # special case for "hdfs://path" (without the triple slash)
+            path = path[1:]
+        return path
+
+    def ls(self, path, detail=False, **kwargs):
+        path = self._strip_protocol(path)
+        from pyarrow.fs import FileSelector
+
+        entries = [
+            self._make_entry(entry)
+            for entry in self.fs.get_file_info(FileSelector(path))
+        ]
+        if detail:
+            return entries
+        else:
+            return [entry["name"] for entry in entries]
+
+    def info(self, path, **kwargs):
+        path = self._strip_protocol(path)
+        [info] = self.fs.get_file_info([path])
+        return self._make_entry(info)
+
+    def exists(self, path):
+        path = self._strip_protocol(path)
+        try:
+            self.info(path)
+        except FileNotFoundError:
+            return False
+        else:
+            return True
+
+    def _make_entry(self, info):
+        from pyarrow.fs import FileType
+
+        if info.type is FileType.Directory:
+            kind = "directory"
+        elif info.type is FileType.File:
+            kind = "file"
+        elif info.type is FileType.NotFound:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), info.path)
+        else:
+            kind = "other"
+
+        return {
+            "name": info.path,
+            "size": info.size,
+            "type": kind,
+            "mtime": info.mtime,
+        }
+
+    @wrap_exceptions
+    def cp_file(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1).rstrip("/")
+        path2 = self._strip_protocol(path2).rstrip("/")
+
+        with self._open(path1, "rb") as lstream:
+            tmp_fname = f"{path2}.tmp.{secrets.token_hex(6)}"
+            try:
+                with self.open(tmp_fname, "wb") as rstream:
+                    shutil.copyfileobj(lstream, rstream)
+                self.fs.move(tmp_fname, path2)
+            except BaseException:  # noqa
+                with suppress(FileNotFoundError):
+                    self.fs.delete_file(tmp_fname)
+                raise
+
+    @wrap_exceptions
+    def mv(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1).rstrip("/")
+        path2 = self._strip_protocol(path2).rstrip("/")
+        self.fs.move(path1, path2)
+
+    @wrap_exceptions
+    def rm_file(self, path):
+        path = self._strip_protocol(path)
+        self.fs.delete_file(path)
+
+    @wrap_exceptions
+    def rm(self, path, recursive=False, maxdepth=None):
+        """Delete a file, or a whole directory when ``recursive`` is true."""
+        path = self._strip_protocol(path).rstrip("/")
+        if not self.isdir(path):
+            self.fs.delete_file(path)
+        elif recursive:
+            self.fs.delete_dir(path)
+        else:
+            raise ValueError("Can't delete directories without recursive=True")
+
+    @wrap_exceptions
+    def _open(self, path, mode="rb", block_size=None, seekable=True, **kwargs):
+        if mode == "rb":
+            if seekable:
+                method = self.fs.open_input_file
+            else:
+                method = self.fs.open_input_stream
+        elif mode == "wb":
+            method = self.fs.open_output_stream
+        elif mode == "ab":
+            method = self.fs.open_append_stream
+        else:
+            raise ValueError(f"unsupported mode for Arrow filesystem: {mode!r}")
+
+        _kwargs = {}
+        if mode != "rb" or not seekable:
+            if int(PYARROW_VERSION.split(".")[0]) >= 4:
+                # disable compression auto-detection
+                _kwargs["compression"] = None
+        stream = method(path, **_kwargs)
+
+        return ArrowFile(self, stream, path, mode, block_size, **kwargs)
+
+    @wrap_exceptions
+    def mkdir(self, path, create_parents=True, **kwargs):
+        path = self._strip_protocol(path)
+        if create_parents:
+            self.makedirs(path, exist_ok=True)
+        else:
+            self.fs.create_dir(path, recursive=False)
+
+    @wrap_exceptions
+    def makedirs(self, path, exist_ok=False):
+        path = self._strip_protocol(path)
+        self.fs.create_dir(path, recursive=True)
+
+    @wrap_exceptions
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        self.fs.delete_dir(path)
+
+    @wrap_exceptions
+    def modified(self, path):
+        path = self._strip_protocol(path)
+        return self.fs.get_file_info(path).mtime
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        kwargs["seekable"] = bool(start) or end is not None  # a byte range needs seek()
+        return super().cat_file(path, start=start or None, end=end, **kwargs)
+
+    def get_file(self, rpath, lpath, **kwargs):
+        kwargs["seekable"] = False
+        super().get_file(rpath, lpath, **kwargs)
+
+
+@mirror_from(
+    "stream",
+    [
+        "read",
+        "seek",
+        "tell",
+        "write",
+        "readable",
+        "writable",
+        "close",
+        "size",
+        "seekable",
+    ],
+)
+class ArrowFile(io.IOBase):
+    def __init__(self, fs, stream, path, mode, block_size=None, **kwargs):
+        self.path = path
+        self.mode = mode
+
+        self.fs = fs
+        self.stream = stream
+
+        self.blocksize = self.block_size = block_size
+        self.kwargs = kwargs
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        return self.close()
+
+
+class HadoopFileSystem(ArrowFSWrapper):
+    """A wrapper on top of the pyarrow.fs.HadoopFileSystem
+    to connect its interface with fsspec"""
+
+    protocol = "hdfs"
+
+    def __init__(
+        self,
+        host="default",
+        port=0,
+        user=None,
+        kerb_ticket=None,
+        replication=3,
+        extra_conf=None,
+        **kwargs,
+    ):
+        """
+
+        Parameters
+        ----------
+        host: str
+            Hostname, IP or "default" to try to read from Hadoop config
+        port: int
+            Port to connect on, or default from Hadoop config if 0
+        user: str or None
+            If given, connect as this username
+        kerb_ticket: str or None
+            If given, use this ticket for authentication
+        replication: int
+            set replication factor of file for write operations. default value is 3.
+        extra_conf: None or dict
+            Passed on to HadoopFileSystem
+        """
+        from pyarrow.fs import HadoopFileSystem
+
+        fs = HadoopFileSystem(
+            host=host,
+            port=port,
+            user=user,
+            kerb_ticket=kerb_ticket,
+            replication=replication,
+            extra_conf=extra_conf,
+        )
+        super().__init__(fs=fs, **kwargs)
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        ops = infer_storage_options(path)
+        out = {}
+        if ops.get("host", None):
+            out["host"] = ops["host"]
+        if ops.get("username", None):
+            out["user"] = ops["username"]
+        if ops.get("port", None):
+            out["port"] = ops["port"]
+        if ops.get("url_query", None):
+            queries = parse_qs(ops["url_query"])
+            if queries.get("replication", None):
+                out["replication"] = int(queries["replication"][0])
+        return out
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e7c7d88afdddf12f77b26bb635bd8bf1e2bd7f1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_mapper.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+import abc
+import hashlib
+
+from fsspec.implementations.local import make_path_posix
+
+
+class AbstractCacheMapper(abc.ABC):
+    """Abstract super-class for mappers from remote URLs to local cached
+    basenames.
+    """
+
+    @abc.abstractmethod
+    def __call__(self, path: str) -> str: ...
+
+    def __eq__(self, other: object) -> bool:
+        # Identity only depends on class. When derived classes have attributes
+        # they will need to be included.
+        return isinstance(other, type(self))
+
+    def __hash__(self) -> int:
+        # Identity only depends on class. When derived classes have attributes
+        # they will need to be included.
+        return hash(type(self))
+
+
+class BasenameCacheMapper(AbstractCacheMapper):
+    """Cache mapper that uses the basename of the remote URL and a fixed number
+    of directory levels above this.
+
+    The default is zero directory levels, meaning different paths with the same
+    basename will have the same cached basename.
+    """
+
+    def __init__(self, directory_levels: int = 0):
+        if directory_levels < 0:
+            raise ValueError(
+                "BasenameCacheMapper requires zero or positive directory_levels"
+            )
+        self.directory_levels = directory_levels
+
+        # Separator for directories when encoded as strings.
+        self._separator = "_@_"
+
+    def __call__(self, path: str) -> str:
+        path = make_path_posix(path)
+        prefix, *bits = path.rsplit("/", self.directory_levels + 1)
+        if bits:
+            return self._separator.join(bits)
+        else:
+            return prefix  # No separator found, simple filename
+
+    def __eq__(self, other: object) -> bool:
+        return super().__eq__(other) and self.directory_levels == other.directory_levels
+
+    def __hash__(self) -> int:
+        return super().__hash__() ^ hash(self.directory_levels)
+
+
+class HashCacheMapper(AbstractCacheMapper):
+    """Cache mapper that uses a hash of the remote URL."""
+
+    def __call__(self, path: str) -> str:
+        return hashlib.sha256(path.encode()).hexdigest()
+
+
+def create_cache_mapper(same_names: bool) -> AbstractCacheMapper:
+    """Factory method to create cache mapper for backward compatibility with
+    ``CachingFileSystem`` constructor using ``same_names`` kwarg.
+    """
+    if same_names:
+        return BasenameCacheMapper()
+    else:
+        return HashCacheMapper()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd9b5cdd99d7f4a0a989c0f7d0c70ddcf324816a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cache_metadata.py
@@ -0,0 +1,232 @@
+from __future__ import annotations
+
+import os
+import pickle
+import time
+from typing import TYPE_CHECKING
+
+from fsspec.utils import atomic_write
+
+try:
+    import ujson as json
+except ImportError:
+    if not TYPE_CHECKING:
+        import json
+
+if TYPE_CHECKING:
+    from typing import Any, Dict, Iterator, Literal
+
+    from typing_extensions import TypeAlias
+
+    from .cached import CachingFileSystem
+
+    Detail: TypeAlias = Dict[str, Any]
+
+
+class CacheMetadata:
+    """Cache metadata.
+
+    All reading and writing of cache metadata is performed by this class,
+    accessing the cached files and blocks is not.
+
+    Metadata is stored in a single file per storage directory in JSON format.
+    For backward compatibility, also reads metadata stored in pickle format
+    which is converted to JSON when next saved.
+    """
+
+    def __init__(self, storage: list[str]):
+        """
+
+        Parameters
+        ----------
+        storage: list[str]
+            Directories containing cached files, must be at least one. Metadata
+            is stored in the last of these directories by convention.
+        """
+        if not storage:
+            raise ValueError("CacheMetadata expects at least one storage location")
+
+        self._storage = storage
+        self.cached_files: list[Detail] = [{}]
+
+        # Private attribute to force saving of metadata in pickle format rather than
+        # JSON for use in tests to confirm can read both pickle and JSON formats.
+        self._force_save_pickle = False
+
+    def _load(self, fn: str) -> Detail:
+        """Low-level function to load metadata from specific file"""
+        try:
+            with open(fn, "r") as f:
+                loaded = json.load(f)
+        except ValueError:  # not JSON: fall back to the legacy pickle format
+            with open(fn, "rb") as f:
+                loaded = pickle.load(f)  # NOTE(review): only safe for trusted files
+        for c in loaded.values():
+            if isinstance(c.get("blocks"), list):
+                c["blocks"] = set(c["blocks"])  # JSON stores block sets as lists
+        return loaded
+
+    def _save(self, metadata_to_save: Detail, fn: str) -> None:
+        """Low-level function to save metadata to specific file"""
+        if self._force_save_pickle:
+            with atomic_write(fn) as f:
+                pickle.dump(metadata_to_save, f)
+        else:
+            with atomic_write(fn, mode="w") as f:
+                json.dump(metadata_to_save, f)
+
+    def _scan_locations(
+        self, writable_only: bool = False
+    ) -> Iterator[tuple[str, str, bool]]:
+        """Yield locations (filenames) where metadata is stored, and whether
+        writable or not.
+
+        Parameters
+        ----------
+        writable_only: bool
+            Set to True to only yield writable locations.
+
+        Returns
+        -------
+        Yields (metadata filename, storage directory, writable flag)
+        """
+        n = len(self._storage)
+        for i, storage in enumerate(self._storage):
+            writable = i == n - 1  # only the last storage directory is writable
+            if writable_only and not writable:
+                continue
+            yield os.path.join(storage, "cache"), storage, writable
+
+    def check_file(
+        self, path: str, cfs: CachingFileSystem | None
+    ) -> Literal[False] | tuple[Detail, str]:
+        """If path is in cache return its details, otherwise return ``False``.
+
+        If the optional CachingFileSystem is specified then it is used to
+        perform extra checks to reject possible matches, such as if they are
+        too old.
+        """
+        for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
+            if path not in cache:
+                continue
+            detail = cache[path].copy()
+
+            if cfs is not None:
+                if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
+                    # Wrong file as determined by hash of file properties
+                    continue
+                if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
+                    # Cached file has expired
+                    continue
+
+            fn = os.path.join(base, detail["fn"])
+            if os.path.exists(fn):
+                return detail, fn
+        return False
+
+    def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
+        """Remove expired metadata from the cache.
+
+        Returns names of files corresponding to expired metadata and a boolean
+        flag indicating whether the writable cache is empty. Caller is
+        responsible for deleting the expired files.
+        """
+        expired_files = []
+        for path, detail in self.cached_files[-1].copy().items():
+            if time.time() - detail["time"] > expiry_time:
+                fn = detail.get("fn", "")
+                if not fn:
+                    raise RuntimeError(
+                        f"Cache metadata does not contain 'fn' for {path}"
+                    )
+                fn = os.path.join(self._storage[-1], fn)
+                expired_files.append(fn)
+                self.cached_files[-1].pop(path)
+
+        if self.cached_files[-1]:
+            cache_path = os.path.join(self._storage[-1], "cache")
+            self._save(self.cached_files[-1], cache_path)
+
+        writable_cache_empty = not self.cached_files[-1]
+        return expired_files, writable_cache_empty
+
+    def load(self) -> None:
+        """Load all metadata from disk and store in ``self.cached_files``"""
+        cached_files = []
+        for fn, _, _ in self._scan_locations():
+            if os.path.exists(fn):
+                # TODO: consolidate blocks here
+                cached_files.append(self._load(fn))
+            else:
+                cached_files.append({})
+        self.cached_files = cached_files or [{}]
+
+    def on_close_cached_file(self, f: Any, path: str) -> None:
+        """Perform side-effect actions on closing a cached file.
+
+        The actual closing of the file is the responsibility of the caller.
+        """
+        # File must be writable, so in self.cached_files[-1]
+        c = self.cached_files[-1][path]
+        if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
+            c["blocks"] = True  # recorded blocks span the whole file
+
+    def pop_file(self, path: str) -> str | None:
+        """Remove metadata of cached file.
+
+        Return the cached file's name (the caller must delete that file), or
+        ``None`` when ``path`` is not cached.  Raise ``PermissionError`` if the
+        file lives outside the last, writable storage directory.
+        """
+        details = self.check_file(path, None)
+        if not details:
+            return None
+        _, fn = details
+        # Trailing separator stops "/cache" from matching a sibling "/cache2".
+        if not fn.startswith(os.path.join(self._storage[-1], "")):
+            raise PermissionError(
+                "Can only delete cached file in last, writable cache location"
+            )
+        self.cached_files[-1].pop(path)
+        self.save()
+        return fn
+
+    def save(self) -> None:
+        """Save metadata to disk"""
+        for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
+            if not writable:
+                continue
+
+            if os.path.exists(fn):
+                cached_files = self._load(fn)
+                for k, c in cached_files.items():
+                    if k in cache:
+                        if c["blocks"] is True or cache[k]["blocks"] is True:
+                            c["blocks"] = True
+                        else:
+                            # self.cached_files[*][*]["blocks"] must continue to
+                            # point to the same set object so that updates
+                            # performed by MMapCache are propagated back to
+                            # self.cached_files.
+                            blocks = cache[k]["blocks"]
+                            blocks.update(c["blocks"])
+                            c["blocks"] = blocks
+                        c["time"] = max(c["time"], cache[k]["time"])
+                        c["uid"] = cache[k]["uid"]
+
+                # Files can be added to cache after it was written once
+                for k, c in cache.items():
+                    if k not in cached_files:
+                        cached_files[k] = c
+            else:
+                cached_files = cache
+            cache = {k: v.copy() for k, v in cached_files.items()}
+            for c in cache.values():
+                if isinstance(c["blocks"], set):
+                    c["blocks"] = list(c["blocks"])
+            self._save(cache, fn)
+            self.cached_files[-1] = cached_files
+
+    def update_file(self, path: str, detail: Detail) -> None:
+        """Update metadata for specific file in memory, do not save"""
+        self.cached_files[-1][path] = detail
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2ee17214a23e7d74ecf935449cf8923ad903ce1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/cached.py
@@ -0,0 +1,939 @@
+from __future__ import annotations
+
+import inspect
+import logging
+import os
+import tempfile
+import time
+import weakref
+from shutil import rmtree
+from typing import TYPE_CHECKING, Any, Callable, ClassVar
+
+from fsspec import AbstractFileSystem, filesystem
+from fsspec.callbacks import DEFAULT_CALLBACK
+from fsspec.compression import compr
+from fsspec.core import BaseCache, MMapCache
+from fsspec.exceptions import BlocksizeMismatchError
+from fsspec.implementations.cache_mapper import create_cache_mapper
+from fsspec.implementations.cache_metadata import CacheMetadata
+from fsspec.spec import AbstractBufferedFile
+from fsspec.transaction import Transaction
+from fsspec.utils import infer_compression
+
+if TYPE_CHECKING:
+    from fsspec.implementations.cache_mapper import AbstractCacheMapper
+
+logger = logging.getLogger("fsspec.cached")
+
+
+class WriteCachedTransaction(Transaction):
+    def complete(self, commit=True):
+        rpaths = [f.path for f in self.files]
+        lpaths = [f.fn for f in self.files]
+        if commit:
+            self.fs.put(lpaths, rpaths)
+        self.files.clear()
+        self.fs._intrans = False
+        self.fs._transaction = None
+        self.fs = None  # break cycle
+
+
+class CachingFileSystem(AbstractFileSystem):
+    """Locally caching filesystem, layer over any other FS
+
+    This class implements chunk-wise local storage of remote files, for quick
+    access after the initial download. The files are stored in a given
+    directory with hashes of URLs for the filenames. If no directory is given,
+    a temporary one is used, which should be cleaned up by the OS after the
+    process ends. The files themselves are sparse (as implemented in
+    :class:`~fsspec.caching.MMapCache`), so only the data which is accessed
+    takes up space.
+
+    Restrictions:
+
+    - the block-size must be the same for each access of a given file, unless
+      all blocks of the file have already been read
+    - caching can only be applied to file-systems which produce files
+      derived from fsspec.spec.AbstractBufferedFile ; LocalFileSystem is also
+      allowed, for testing
+    """
+
+    protocol: ClassVar[str | tuple[str, ...]] = ("blockcache", "cached")
+
+    def __init__(
+        self,
+        target_protocol=None,
+        cache_storage="TMP",
+        cache_check=10,
+        check_files=False,
+        expiry_time=604800,
+        target_options=None,
+        fs=None,
+        same_names: bool | None = None,
+        compression=None,
+        cache_mapper: AbstractCacheMapper | None = None,
+        **kwargs,
+    ):
+        """Create a caching layer over a target filesystem.
+
+        Parameters
+        ----------
+        target_protocol: str (optional)
+            Target filesystem protocol. Provide either this or ``fs``.
+        cache_storage: str or list(str)
+            Location to store files. If "TMP", this is a temporary directory,
+            and will be cleaned up by the OS when this process ends (or later).
+            If a list, each location will be tried in the order given, but
+            only the last will be considered writable.
+        cache_check: int
+            Number of seconds between reload of cache metadata
+        check_files: bool
+            Whether to explicitly see if the UID of the remote file matches
+            the stored one before using. Warning: some file systems such as
+            HTTP cannot reliably give a unique hash of the contents of some
+            path, so be sure to set this option to False.
+        expiry_time: int
+            The time in seconds after which a local copy is considered useless.
+            Set to falsy to prevent expiry. The default is equivalent to one
+            week.
+        target_options: dict or None
+            Passed to the instantiation of the FS, if fs is None.
+        fs: filesystem instance
+            The target filesystem to run against. Provide either this or
+            ``target_protocol``.
+        same_names: bool (optional)
+            By default, target URLs are hashed using a ``HashCacheMapper`` so
+            that files from different backends with the same basename do not
+            conflict. If this argument is ``True``, a ``BasenameCacheMapper``
+            is used instead. Other cache mapper options are available by using
+            the ``cache_mapper`` keyword argument. Only one of this and
+            ``cache_mapper`` should be specified.
+        compression: str (optional)
+            To decompress on download. Can be 'infer' (guess from the URL name),
+            one of the entries in ``fsspec.compression.compr``, or None for no
+            decompression.
+        cache_mapper: AbstractCacheMapper (optional)
+            The object used to map from original filenames to cached filenames.
+            Only one of this and ``same_names`` should be specified.
+
+        Raises
+        ------
+        ValueError
+            If neither or both of ``fs`` and ``target_protocol`` are given, or
+            if both ``same_names`` and ``cache_mapper`` are given.
+        """
+        super().__init__(**kwargs)
+        if fs is None and target_protocol is None:
+            raise ValueError(
+                "Please provide filesystem instance(fs) or target_protocol"
+            )
+        # ``not`` applies to the whole XOR: after the check above this is true
+        # only when both ``fs`` and ``target_protocol`` were supplied.
+        if not (fs is None) ^ (target_protocol is None):
+            raise ValueError(
+                "Both filesystems (fs) and target_protocol may not be both given."
+            )
+        if cache_storage == "TMP":
+            tempdir = tempfile.mkdtemp()
+            storage = [tempdir]
+            # remove the temporary directory when this instance is finalized
+            weakref.finalize(self, self._remove_tempdir, tempdir)
+        else:
+            if isinstance(cache_storage, str):
+                storage = [cache_storage]
+            else:
+                storage = cache_storage
+        # only the last location is created here; it is the writable one
+        os.makedirs(storage[-1], exist_ok=True)
+        self.storage = storage
+        self.kwargs = target_options or {}
+        self.cache_check = cache_check
+        self.check_files = check_files
+        self.expiry = expiry_time
+        self.compression = compression
+
+        # Size of cache in bytes. If None then the size is unknown and will be
+        # recalculated the next time cache_size() is called. On writes to the
+        # cache this is reset to None.
+        self._cache_size = None
+
+        if same_names is not None and cache_mapper is not None:
+            raise ValueError(
+                "Cannot specify both same_names and cache_mapper in "
+                "CachingFileSystem.__init__"
+            )
+        if cache_mapper is not None:
+            self._mapper = cache_mapper
+        else:
+            self._mapper = create_cache_mapper(
+                same_names if same_names is not None else False
+            )
+
+        # first protocol name of ``fs`` is used when no protocol string is given
+        self.target_protocol = (
+            target_protocol
+            if isinstance(target_protocol, str)
+            else (fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0])
+        )
+        self._metadata = CacheMetadata(self.storage)
+        self.load_cache()
+        # ``fs`` is assigned last: until it is present in the instance dict,
+        # __getattribute__ resolves attributes on the superclass instead of
+        # forwarding them to the target filesystem.
+        self.fs = fs if fs is not None else filesystem(target_protocol, **self.kwargs)
+
+        def _strip_protocol(path):
+            # acts as a method, since each instance has a difference target
+            return self.fs._strip_protocol(type(self)._strip_protocol(path))
+
+        self._strip_protocol: Callable = _strip_protocol
+
+    @staticmethod
+    def _remove_tempdir(tempdir):
+        try:
+            rmtree(tempdir)
+        except Exception:
+            pass
+
+    def _mkcache(self):
+        os.makedirs(self.storage[-1], exist_ok=True)
+
+    def cache_size(self):
+        """Return size of cache in bytes.
+
+        If more than one cache directory is in use, only the size of the last
+        one (the writable cache directory) is returned.
+        """
+        if self._cache_size is None:
+            cache_dir = self.storage[-1]
+            self._cache_size = filesystem("file").du(cache_dir, withdirs=True)
+        return self._cache_size
+
+    def load_cache(self):
+        """Read set of stored blocks from file
+
+        Loads the metadata through ``self._metadata.load()``, makes sure the
+        writable cache directory exists, and records the current time in
+        ``self.last_cache`` (compared against ``cache_check`` by
+        ``_check_cache``).
+        """
+        self._metadata.load()
+        self._mkcache()
+        self.last_cache = time.time()
+
+    def save_cache(self):
+        """Save set of stored blocks to file
+
+        Ensures the writable cache directory exists, saves the metadata
+        through ``self._metadata.save()``, refreshes ``self.last_cache`` and
+        invalidates the memoised cache size.
+        """
+        self._mkcache()
+        self._metadata.save()
+        self.last_cache = time.time()
+        # size on disk may have changed; force cache_size() to recompute
+        self._cache_size = None
+
+    def _check_cache(self):
+        """Reload caches if time elapsed or any disappeared"""
+        self._mkcache()
+        if not self.cache_check:
+            # explicitly told not to bother checking
+            return
+        timecond = time.time() - self.last_cache > self.cache_check
+        existcond = all(os.path.exists(storage) for storage in self.storage)
+        if timecond or not existcond:
+            self.load_cache()
+
+    def _check_file(self, path):
+        """Is path in cache and still valid
+
+        The path is protocol-stripped and the cache metadata refreshed if
+        needed, then the answer is whatever ``self._metadata.check_file``
+        returns. Callers in this class treat a truthy result as a
+        ``(detail, fn)`` pair.
+        """
+        path = self._strip_protocol(path)
+        self._check_cache()
+        return self._metadata.check_file(path, self)
+
+    def clear_cache(self):
+        """Remove all files and metadata from the cache
+
+        In the case of multiple cache locations, this clears only the last one,
+        which is assumed to be the read/write one.
+        """
+        rmtree(self.storage[-1])
+        # in this class load_cache() reloads the metadata and re-creates the
+        # directory just removed (via _mkcache)
+        self.load_cache()
+        self._cache_size = None
+
+    def clear_expired_cache(self, expiry_time=None):
+        """Remove all expired files and metadata from the cache
+
+        In the case of multiple cache locations, this clears only the last one,
+        which is assumed to be the read/write one.
+
+        Parameters
+        ----------
+        expiry_time: int
+            The time in seconds after which a local copy is considered useless.
+            If not defined the default is equivalent to the attribute from the
+            file caching instantiation.
+        """
+
+        if not expiry_time:
+            expiry_time = self.expiry
+
+        self._check_cache()
+
+        expired_files, writable_cache_empty = self._metadata.clear_expired(expiry_time)
+        for fn in expired_files:
+            if os.path.exists(fn):
+                os.remove(fn)
+
+        if writable_cache_empty:
+            rmtree(self.storage[-1])
+            self.load_cache()
+
+        self._cache_size = None
+
+    def pop_from_cache(self, path):
+        """Remove cached version of given file
+
+        Deletes local copy of the given (remote) path. If it is found in a cache
+        location which is not the last, it is assumed to be read-only, and
+        raises PermissionError
+        """
+        path = self._strip_protocol(path)
+        fn = self._metadata.pop_file(path)
+        if fn is not None:
+            os.remove(fn)
+        self._cache_size = None
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Wrap the target _open
+
+        If the whole file exists in the cache, just open it locally and
+        return that.
+
+        Otherwise, open the file on the target FS, and make it have a mmap
+        cache pointing to the location which we determine, in our cache.
+        The ``blocks`` instance is shared, so as the mmap cache instance
+        updates, so does the entry in our ``cached_files`` attribute.
+        We monkey-patch this file, so that when it closes, we call
+        ``close_and_update`` to save the state of the blocks.
+
+        Modes without "r" are passed straight through to the target
+        filesystem with no caching.
+
+        Raises
+        ------
+        BlocksizeMismatchError
+            If a partially cached file is reopened with a block size that
+            differs from the one recorded in its metadata.
+        """
+        path = self._strip_protocol(path)
+
+        # NOTE(review): the instance-level _strip_protocol set in __init__
+        # already applies self.fs._strip_protocol, so this second call looks
+        # redundant - confirm before removing.
+        path = self.fs._strip_protocol(path)
+        if "r" not in mode:
+            return self.fs._open(
+                path,
+                mode=mode,
+                block_size=block_size,
+                autocommit=autocommit,
+                cache_options=cache_options,
+                **kwargs,
+            )
+        detail = self._check_file(path)
+        if detail:
+            # file is in cache
+            detail, fn = detail
+            hash, blocks = detail["fn"], detail["blocks"]
+            if blocks is True:
+                # stored file is complete
+                logger.debug("Opening local copy of %s", path)
+                return open(fn, mode)
+            # TODO: action where partial file exists in read-only cache
+            logger.debug("Opening partially cached copy of %s", path)
+        else:
+            # not cached yet: register a new entry with an empty block set
+            hash = self._mapper(path)
+            fn = os.path.join(self.storage[-1], hash)
+            blocks = set()
+            detail = {
+                "original": path,
+                "fn": hash,
+                "blocks": blocks,
+                "time": time.time(),
+                "uid": self.fs.ukey(path),
+            }
+            self._metadata.update_file(path, detail)
+            logger.debug("Creating local sparse file for %s", path)
+
+        # call target filesystems open
+        self._mkcache()
+        # cache_type="none": the file's cache is replaced by MMapCache below
+        f = self.fs._open(
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            cache_options=cache_options,
+            cache_type="none",
+            **kwargs,
+        )
+        if self.compression:
+            comp = (
+                infer_compression(path)
+                if self.compression == "infer"
+                else self.compression
+            )
+            f = compr[comp](f, mode="rb")
+        if "blocksize" in detail:
+            if detail["blocksize"] != f.blocksize:
+                raise BlocksizeMismatchError(
+                    f"Cached file must be reopened with same block"
+                    f" size as original (old: {detail['blocksize']},"
+                    f" new {f.blocksize})"
+                )
+        else:
+            # first open of this file: remember the block size used
+            detail["blocksize"] = f.blocksize
+        # ``blocks`` is the very object stored in the metadata entry, so the
+        # cache's updates are visible there without copying
+        f.cache = MMapCache(f.blocksize, f._fetch_range, f.size, fn, blocks)
+        # keep the real close and route f.close() through close_and_update
+        close = f.close
+        f.close = lambda: self.close_and_update(f, close)
+        self.save_cache()
+        return f
+
+    def _parent(self, path):
+        """Return the parent of ``path`` as computed by the target filesystem."""
+        return self.fs._parent(path)
+
+    def hash_name(self, path: str, *args: Any) -> str:
+        # Kept for backward compatibility with downstream libraries.
+        # Ignores extra arguments, previously same_name boolean.
+        return self._mapper(path)
+
+    def close_and_update(self, f, close):
+        """Called when a file is closing, so store the set of blocks"""
+        if f.closed:
+            return
+        path = self._strip_protocol(f.path)
+        self._metadata.on_close_cached_file(f, path)
+        try:
+            logger.debug("going to save")
+            self.save_cache()
+            logger.debug("saved")
+        except OSError:
+            logger.debug("Cache saving failed while closing file")
+        except NameError:
+            logger.debug("Cache save failed due to interpreter shutdown")
+        close()
+        f.closed = True
+
+    def ls(self, path, detail=True):
+        """List ``path`` on the target filesystem; the cache is not consulted."""
+        return self.fs.ls(path, detail)
+
+    def __getattribute__(self, item):
+        """Attribute lookup that forwards most names to the target filesystem.
+
+        Resolution order, as implemented below:
+
+        1. names in the explicit set are taken from this class and returned as
+           a wrapper that calls the method bound to this instance;
+        2. a few special names (``__reduce_ex__``, ``transaction``, ``_cache``,
+           ``transaction_type``, ``__class__``) are handled individually;
+        3. anything in this instance's ``__dict__`` is returned as is;
+        4. otherwise, once ``fs`` has been assigned, the name is looked up on
+           the target filesystem (instance dict first, then its class);
+        5. before ``fs`` exists, lookup falls back to the superclass.
+        """
+        # NOTE: a few names ("_mkcache", "isdir", "isfile", "exists") appear
+        # twice in this set literal; duplicates are harmless.
+        if item in {
+            "load_cache",
+            "_open",
+            "save_cache",
+            "close_and_update",
+            "__init__",
+            "__getattribute__",
+            "__reduce__",
+            "_make_local_details",
+            "open",
+            "cat",
+            "cat_file",
+            "cat_ranges",
+            "get",
+            "read_block",
+            "tail",
+            "head",
+            "info",
+            "ls",
+            "exists",
+            "isfile",
+            "isdir",
+            "_check_file",
+            "_check_cache",
+            "_mkcache",
+            "clear_cache",
+            "clear_expired_cache",
+            "pop_from_cache",
+            "_mkcache",
+            "local_file",
+            "_paths_from_path",
+            "get_mapper",
+            "open_many",
+            "commit_many",
+            "hash_name",
+            "__hash__",
+            "__eq__",
+            "to_json",
+            "cache_size",
+            "pipe_file",
+            "pipe",
+            "isdir",
+            "isfile",
+            "exists",
+            "start_transaction",
+            "end_transaction",
+        }:
+            # all the methods defined in this class. Note `open` here, since
+            # it calls `_open`, but is actually in superclass
+            return lambda *args, **kw: getattr(type(self), item).__get__(self)(
+                *args, **kw
+            )
+        if item in ["__reduce_ex__"]:
+            raise AttributeError
+        if item in ["transaction"]:
+            # property
+            return type(self).transaction.__get__(self)
+        if item in ["_cache", "transaction_type"]:
+            # class attributes
+            return getattr(type(self), item)
+        if item == "__class__":
+            return type(self)
+        # object.__getattribute__ is used here to avoid re-entering this method
+        d = object.__getattribute__(self, "__dict__")
+        fs = d.get("fs", None)  # fs is not immediately defined
+        if item in d:
+            return d[item]
+        elif fs is not None:
+            if item in fs.__dict__:
+                # attribute of instance
+                return fs.__dict__[item]
+            # attributed belonging to the target filesystem
+            cls = type(fs)
+            m = getattr(cls, item)
+            if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
+                not hasattr(m, "__self__") or m.__self__ is None
+            ):
+                # instance method
+                return m.__get__(fs, cls)
+            return m  # class method or attribute
+        else:
+            # attributes of the superclass, while target is being set up
+            return super().__getattribute__(item)
+
+    def __eq__(self, other):
+        """Test for equality."""
+        if self is other:
+            return True
+        if not isinstance(other, type(self)):
+            return False
+        return (
+            self.storage == other.storage
+            and self.kwargs == other.kwargs
+            and self.cache_check == other.cache_check
+            and self.check_files == other.check_files
+            and self.expiry == other.expiry
+            and self.compression == other.compression
+            and self._mapper == other._mapper
+            and self.target_protocol == other.target_protocol
+        )
+
+    def __hash__(self):
+        """Calculate hash."""
+        return (
+            hash(tuple(self.storage))
+            ^ hash(str(self.kwargs))
+            ^ hash(self.cache_check)
+            ^ hash(self.check_files)
+            ^ hash(self.expiry)
+            ^ hash(self.compression)
+            ^ hash(self._mapper)
+            ^ hash(self.target_protocol)
+        )
+
+    def to_json(self):
+        """Calculate JSON representation.
+
+        Not implemented yet for CachingFileSystem.
+        """
+        raise NotImplementedError(
+            "CachingFileSystem JSON representation not implemented"
+        )
+
+
+class WholeFileCacheFileSystem(CachingFileSystem):
+    """Caches whole remote files on first access
+
+    This class is intended as a layer over any other file system, and
+    will make a local copy of each file accessed, so that all subsequent
+    reads are local. This is similar to ``CachingFileSystem``, but without
+    the block-wise functionality and so can work even when sparse files
+    are not allowed. See its docstring for definition of the init
+    arguments.
+
+    The class still needs access to the remote store for listing files,
+    and may refresh cached files.
+    """
+
+    protocol = "filecache"
+    local_file = True
+
+    def open_many(self, open_files, **kwargs):
+        paths = [of.path for of in open_files]
+        if "r" in open_files.mode:
+            self._mkcache()
+        else:
+            return [
+                LocalTempFile(
+                    self.fs,
+                    path,
+                    mode=open_files.mode,
+                    fn=os.path.join(self.storage[-1], self._mapper(path)),
+                    **kwargs,
+                )
+                for path in paths
+            ]
+
+        if self.compression:
+            raise NotImplementedError
+        details = [self._check_file(sp) for sp in paths]
+        downpath = [p for p, d in zip(paths, details) if not d]
+        downfn0 = [
+            os.path.join(self.storage[-1], self._mapper(p))
+            for p, d in zip(paths, details)
+        ]  # keep these path names for opening later
+        downfn = [fn for fn, d in zip(downfn0, details) if not d]
+        if downpath:
+            # skip if all files are already cached and up to date
+            self.fs.get(downpath, downfn)
+
+            # update metadata - only happens when downloads are successful
+            newdetail = [
+                {
+                    "original": path,
+                    "fn": self._mapper(path),
+                    "blocks": True,
+                    "time": time.time(),
+                    "uid": self.fs.ukey(path),
+                }
+                for path in downpath
+            ]
+            for path, detail in zip(downpath, newdetail):
+                self._metadata.update_file(path, detail)
+            self.save_cache()
+
+        def firstpart(fn):
+            # helper to adapt both whole-file and simple-cache
+            return fn[1] if isinstance(fn, tuple) else fn
+
+        return [
+            open(firstpart(fn0) if fn0 else fn1, mode=open_files.mode)
+            for fn0, fn1 in zip(details, downfn0)
+        ]
+
+    def commit_many(self, open_files):
+        self.fs.put([f.fn for f in open_files], [f.path for f in open_files])
+        [f.close() for f in open_files]
+        for f in open_files:
+            # in case autocommit is off, and so close did not already delete
+            try:
+                os.remove(f.name)
+            except FileNotFoundError:
+                pass
+        self._cache_size = None
+
+    def _make_local_details(self, path):
+        hash = self._mapper(path)
+        fn = os.path.join(self.storage[-1], hash)
+        detail = {
+            "original": path,
+            "fn": hash,
+            "blocks": True,
+            "time": time.time(),
+            "uid": self.fs.ukey(path),
+        }
+        self._metadata.update_file(path, detail)
+        logger.debug("Copying %s to local cache", path)
+        return fn
+
+    def cat(
+        self,
+        path,
+        recursive=False,
+        on_error="raise",
+        callback=DEFAULT_CALLBACK,
+        **kwargs,
+    ):
+        paths = self.expand_path(
+            path, recursive=recursive, maxdepth=kwargs.get("maxdepth", None)
+        )
+        getpaths = []
+        storepaths = []
+        fns = []
+        out = {}
+        for p in paths.copy():
+            try:
+                detail = self._check_file(p)
+                if not detail:
+                    fn = self._make_local_details(p)
+                    getpaths.append(p)
+                    storepaths.append(fn)
+                else:
+                    detail, fn = detail if isinstance(detail, tuple) else (None, detail)
+                fns.append(fn)
+            except Exception as e:
+                if on_error == "raise":
+                    raise
+                if on_error == "return":
+                    out[p] = e
+                paths.remove(p)
+
+        if getpaths:
+            self.fs.get(getpaths, storepaths)
+            self.save_cache()
+
+        callback.set_size(len(paths))
+        for p, fn in zip(paths, fns):
+            with open(fn, "rb") as f:
+                out[p] = f.read()
+            callback.relative_update(1)
+        if isinstance(path, str) and len(paths) == 1 and recursive is False:
+            out = out[paths[0]]
+        return out
+
+    def _open(self, path, mode="rb", **kwargs):
+        """Open ``path`` through the whole-file cache.
+
+        Write modes return a ``LocalTempFile``. Read modes return a plain
+        local file object for the cached copy, downloading (and optionally
+        decompressing) the file first when it is not cached yet.
+
+        Raises
+        ------
+        ValueError
+            If the metadata says the file is only partially cached.
+        """
+        path = self._strip_protocol(path)
+        if "r" not in mode:
+            # NOTE(review): _make_local_details calls self.fs.ukey(path) for a
+            # path that is about to be written - presumably the target
+            # tolerates paths that do not exist yet; verify.
+            fn = self._make_local_details(path)
+            user_specified_kwargs = {
+                k: v
+                for k, v in kwargs.items()
+                # those kwargs were added by open(), we don't want them
+                if k not in ["autocommit", "block_size", "cache_options"]
+            }
+            return LocalTempFile(self, path, mode=mode, fn=fn, **user_specified_kwargs)
+        detail = self._check_file(path)
+        if detail:
+            detail, fn = detail
+            _, blocks = detail["fn"], detail["blocks"]
+            if blocks is True:
+                logger.debug("Opening local copy of %s", path)
+
+                # In order to support downstream filesystems to be able to
+                # infer the compression from the original filename, like
+                # the `TarFileSystem`, let's extend the `io.BufferedReader`
+                # fileobject protocol by adding a dedicated attribute
+                # `original`.
+                f = open(fn, mode)
+                f.original = detail.get("original")
+                return f
+            else:
+                raise ValueError(
+                    f"Attempt to open partially cached file {path}"
+                    f" as a wholly cached file"
+                )
+        else:
+            fn = self._make_local_details(path)
+        kwargs["mode"] = mode
+
+        # call target filesystems open
+        self._mkcache()
+        if self.compression:
+            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
+                if isinstance(f, AbstractBufferedFile):
+                    # want no type of caching if just downloading whole thing
+                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
+                comp = (
+                    infer_compression(path)
+                    if self.compression == "infer"
+                    else self.compression
+                )
+                f = compr[comp](f, mode="rb")
+                # copy the decompressed stream block by block until read()
+                # returns an empty result
+                data = True
+                while data:
+                    block = getattr(f, "blocksize", 5 * 2**20)
+                    data = f.read(block)
+                    f2.write(data)
+        else:
+            self.fs.get_file(path, fn)
+        self.save_cache()
+        # the file is now registered as cached: re-enter to open the local copy
+        return self._open(path, mode)
+
+
+class SimpleCacheFileSystem(WholeFileCacheFileSystem):
+    """Caches whole remote files on first access
+
+    This class is intended as a layer over any other file system, and
+    will make a local copy of each file accessed, so that all subsequent
+    reads are local. This implementation only copies whole files, and
+    does not keep any metadata about the download time or file details.
+    It is therefore safer to use in multi-threaded/concurrent situations.
+
+    This is the only of the caching filesystems that supports write: you will
+    be given a real local open file, and upon close and commit, it will be
+    uploaded to the target filesystem; the writability or the target URL is
+    not checked until that time.
+
+    """
+
+    protocol = "simplecache"
+    local_file = True
+    transaction_type = WriteCachedTransaction
+
+    def __init__(self, **kwargs):
+        kw = kwargs.copy()
+        for key in ["cache_check", "expiry_time", "check_files"]:
+            kw[key] = False
+        super().__init__(**kw)
+        for storage in self.storage:
+            if not os.path.exists(storage):
+                os.makedirs(storage, exist_ok=True)
+
+    def _check_file(self, path):
+        self._check_cache()
+        sha = self._mapper(path)
+        for storage in self.storage:
+            fn = os.path.join(storage, sha)
+            if os.path.exists(fn):
+                return fn
+
+    def save_cache(self):
+        """Deliberate no-op: this class keeps no metadata to save."""
+        pass
+
+    def load_cache(self):
+        """Deliberate no-op: this class keeps no metadata to load."""
+        pass
+
+    def pipe_file(self, path, value=None, **kwargs):
+        if self._intrans:
+            with self.open(path, "wb") as f:
+                f.write(value)
+        else:
+            super().pipe_file(path, value)
+
+    def ls(self, path, detail=True, **kwargs):
+        path = self._strip_protocol(path)
+        details = []
+        try:
+            details = self.fs.ls(
+                path, detail=True, **kwargs
+            ).copy()  # don't edit original!
+        except FileNotFoundError as e:
+            ex = e
+        else:
+            ex = None
+        if self._intrans:
+            path1 = path.rstrip("/") + "/"
+            for f in self.transaction.files:
+                if f.path == path:
+                    details.append(
+                        {"name": path, "size": f.size or f.tell(), "type": "file"}
+                    )
+                elif f.path.startswith(path1):
+                    if f.path.count("/") == path1.count("/"):
+                        details.append(
+                            {"name": f.path, "size": f.size or f.tell(), "type": "file"}
+                        )
+                    else:
+                        dname = "/".join(f.path.split("/")[: path1.count("/") + 1])
+                        details.append({"name": dname, "size": 0, "type": "directory"})
+        if ex is not None and not details:
+            raise ex
+        if detail:
+            return details
+        return sorted(_["name"] for _ in details)
+
+    def info(self, path, **kwargs):
+        path = self._strip_protocol(path)
+        if self._intrans:
+            f = [_ for _ in self.transaction.files if _.path == path]
+            if f:
+                return {"name": path, "size": f[0].size or f[0].tell(), "type": "file"}
+            f = any(_.path.startswith(path + "/") for _ in self.transaction.files)
+            if f:
+                return {"name": path, "size": 0, "type": "directory"}
+        return self.fs.info(path, **kwargs)
+
+    def pipe(self, path, value=None, **kwargs):
+        if isinstance(path, str):
+            self.pipe_file(self._strip_protocol(path), value, **kwargs)
+        elif isinstance(path, dict):
+            for k, v in path.items():
+                self.pipe_file(self._strip_protocol(k), v, **kwargs)
+        else:
+            raise ValueError("path must be str or dict")
+
+    def cat_ranges(
+        self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
+    ):
+        """Fetch byte ranges from several files, delegating to the superclass.
+
+        NOTE(review): the statements before the ``super()`` call look like an
+        attempted bulk pre-download of uncached files. However, ``_check_file``
+        in this class returns a local path or ``None``, never ``False``, so
+        ``rpaths`` and ``lpaths`` are always empty and ``self.fs.get`` is
+        called with two empty lists. Confirm the intended behaviour before
+        relying on any pre-fetching here.
+        """
+        lpaths = [self._check_file(p) for p in paths]
+        rpaths = [p for l, p in zip(lpaths, paths) if l is False]
+        lpaths = [l for l, p in zip(lpaths, paths) if l is False]
+        self.fs.get(rpaths, lpaths)
+        return super().cat_ranges(
+            paths, starts, ends, max_gap=max_gap, on_error=on_error, **kwargs
+        )
+
+    def _open(self, path, mode="rb", **kwargs):
+        """Open ``path`` through the simple (metadata-free) cache.
+
+        Write modes return a ``LocalTempFile`` in the last storage location,
+        with autocommit disabled while a transaction is active. Read modes
+        return a plain local file object, copying (and optionally
+        decompressing) the remote file into the cache first if needed.
+        """
+        path = self._strip_protocol(path)
+        sha = self._mapper(path)
+
+        if "r" not in mode:
+            fn = os.path.join(self.storage[-1], sha)
+            user_specified_kwargs = {
+                k: v
+                for k, v in kwargs.items()
+                if k not in ["autocommit", "block_size", "cache_options"]
+            }  # those were added by open()
+            return LocalTempFile(
+                self,
+                path,
+                mode=mode,
+                autocommit=not self._intrans,
+                fn=fn,
+                **user_specified_kwargs,
+            )
+        fn = self._check_file(path)
+        if fn:
+            # already present in one of the storage locations
+            return open(fn, mode)
+
+        fn = os.path.join(self.storage[-1], sha)
+        logger.debug("Copying %s to local cache", path)
+        kwargs["mode"] = mode
+
+        self._mkcache()
+        self._cache_size = None
+        if self.compression:
+            with self.fs._open(path, **kwargs) as f, open(fn, "wb") as f2:
+                if isinstance(f, AbstractBufferedFile):
+                    # want no type of caching if just downloading whole thing
+                    f.cache = BaseCache(0, f.cache.fetcher, f.size)
+                comp = (
+                    infer_compression(path)
+                    if self.compression == "infer"
+                    else self.compression
+                )
+                f = compr[comp](f, mode="rb")
+                # copy the decompressed stream block by block until read()
+                # returns an empty result
+                data = True
+                while data:
+                    block = getattr(f, "blocksize", 5 * 2**20)
+                    data = f.read(block)
+                    f2.write(data)
+        else:
+            self.fs.get_file(path, fn)
+        # the local copy now exists: re-enter to open it
+        return self._open(path, mode)
+
+
+class LocalTempFile:
+    """A temporary local file, which will be uploaded on commit"""
+
+    def __init__(self, fs, path, fn, mode="wb", autocommit=True, seek=0, **kwargs):
+        self.fn = fn
+        self.fh = open(fn, mode)
+        self.mode = mode
+        if seek:
+            self.fh.seek(seek)
+        self.path = path
+        self.size = None
+        self.fs = fs
+        self.closed = False
+        self.autocommit = autocommit
+        self.kwargs = kwargs
+
+    def __reduce__(self):
+        # always open in r+b to allow continuing writing at a location
+        return (
+            LocalTempFile,
+            (self.fs, self.path, self.fn, "r+b", self.autocommit, self.tell()),
+        )
+
+    def __enter__(self):
+        return self.fh
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def close(self):
+        self.size = self.fh.tell()
+        if self.closed:
+            return
+        self.fh.close()
+        self.closed = True
+        if self.autocommit:
+            self.commit()
+
+    def discard(self):
+        self.fh.close()
+        os.remove(self.fn)
+
+    def commit(self):
+        self.fs.put(self.fn, self.path, **self.kwargs)
+        # we do not delete local copy - it's still in the cache
+
+    @property
+    def name(self):
+        return self.fn
+
+    def __repr__(self) -> str:
+        return f"LocalTempFile: {self.path}"
+
+    def __getattr__(self, item):
+        return getattr(self.fh, item)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e1276463db6866665e6a0fe114efc247971b57e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dask.py
@@ -0,0 +1,152 @@
+import dask
+from distributed.client import Client, _get_global_client
+from distributed.worker import Worker
+
+from fsspec import filesystem
+from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
+from fsspec.utils import infer_storage_options
+
+
+def _get_client(client):
+    """Turn ``client`` into a ``Client``; ``None`` selects the global client"""
+    if isinstance(client, Client):
+        # already a client object: hand it back untouched
+        return client
+    if client is not None:
+        return Client(client)  # e.g., connection string
+    return _get_global_client()
+
+
+def _in_worker():
+    return True if Worker._instances else False  # truthiness of Worker._instances
+
+
+class DaskWorkerFileSystem(AbstractFileSystem):
+    """View files accessible to a worker as any other remote file-system
+
+    When instances are run on the worker, uses the real filesystem. When
+    run on the client, they call the worker to provide information or data.
+
+    **Warning** this implementation is experimental, and read-only for now.
+    """
+
+    def __init__(
+        self, target_protocol=None, target_options=None, fs=None, client=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        # exactly one of ``fs`` and ``target_protocol`` has to be supplied
+        if (fs is None) == (target_protocol is None):
+            raise ValueError(
+                "Please provide one of filesystem instance (fs) or"
+                " target_protocol, not both"
+            )
+        self.fs = fs
+        self.client = client
+        self.worker = None
+        self.target_options = target_options
+        self.target_protocol = target_protocol
+        self._determine_worker()
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        """Derive a ``client`` address kwarg when the URL has host and port"""
+        opts = infer_storage_options(path)
+        if "host" not in opts or "port" not in opts:
+            return {}
+        return {"client": f"{opts['host']}:{opts['port']}"}
+
+    def _determine_worker(self):
+        """Set ``self.worker`` and prepare the target fs or the delayed proxy"""
+        self.worker = bool(_in_worker())
+        if not self.worker:
+            # client side: remote calls go through a dask.delayed wrapper of self
+            self.client = _get_client(self.client)
+            self.rfs = dask.delayed(self)
+            return
+        if self.fs is None:
+            target_kwargs = self.target_options or {}
+            self.fs = filesystem(self.target_protocol, **target_kwargs)
+
+    def mkdir(self, *args, **kwargs):
+        """Run ``mkdir`` on the wrapped fs (worker) or via ``rfs`` (client)"""
+        if not self.worker:
+            self.rfs.mkdir(*args, **kwargs).compute()
+        else:
+            self.fs.mkdir(*args, **kwargs)
+
+    def rm(self, *args, **kwargs):
+        """Run ``rm`` on the wrapped fs (worker) or via ``rfs`` (client)"""
+        if not self.worker:
+            self.rfs.rm(*args, **kwargs).compute()
+        else:
+            self.fs.rm(*args, **kwargs)
+
+    def copy(self, *args, **kwargs):
+        """Run ``copy`` on the wrapped fs (worker) or via ``rfs`` (client)"""
+        if not self.worker:
+            self.rfs.copy(*args, **kwargs).compute()
+        else:
+            self.fs.copy(*args, **kwargs)
+
+    def mv(self, *args, **kwargs):
+        """Run ``mv`` on the wrapped fs (worker) or via ``rfs`` (client)"""
+        if not self.worker:
+            self.rfs.mv(*args, **kwargs).compute()
+        else:
+            self.fs.mv(*args, **kwargs)
+
+    def ls(self, *args, **kwargs):
+        """Return ``ls`` from the wrapped fs (worker) or via ``rfs`` (client)"""
+        if not self.worker:
+            return self.rfs.ls(*args, **kwargs).compute()
+        else:
+            return self.fs.ls(*args, **kwargs)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Open via the wrapped fs on a worker, else return a ``DaskFile``"""
+        # the same keyword arguments are forwarded on either side
+        common = dict(
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            cache_options=cache_options,
+            **kwargs,
+        )
+        if self.worker:
+            return self.fs._open(path, **common)
+        return DaskFile(fs=self, path=path, **common)
+
+    def fetch_range(self, path, mode, start, end):
+        """Return the bytes of ``path`` between offsets ``start`` and ``end``"""
+        if not self.worker:
+            return self.rfs.fetch_range(path, mode, start, end).compute()
+        # worker side: open the file, seek and read the requested length
+        with self._open(path, mode) as f:
+            f.seek(start)
+            return f.read(end - start)
+
+
+class DaskFile(AbstractBufferedFile):
+    def __init__(self, mode="rb", **kwargs):
+        if mode != "rb":  # the upload hooks below are no-ops
+            raise ValueError('Remote dask files can only be opened in "rb" mode')
+        super().__init__(**kwargs)
+
+    def _upload_chunk(self, final=False):
+        """Does nothing and returns None"""
+
+    def _initiate_upload(self):
+        """Create remote file/upload (currently does nothing)"""
+
+    def _fetch_range(self, start, end):
+        """Get the specified set of bytes from remote"""
+        path, mode = self.path, self.mode
+        return self.fs.fetch_range(path, mode, start, end)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py
new file mode 100644
index 0000000000000000000000000000000000000000..519032305bed633f2ba8a6148076433caf81710b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/data.py
@@ -0,0 +1,58 @@
+import base64
+import io
+from typing import Optional
+from urllib.parse import unquote
+
+from fsspec import AbstractFileSystem
+
+
+class DataFileSystem(AbstractFileSystem):
+    """Decoder that exposes the content of data-URLs as read-only files
+
+    Example
+    -------
+    >>> with fsspec.open("data:,Hello%2C%20World%21") as f:
+    ...     print(f.read())
+    b"Hello, World!"
+
+    See https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs
+    """
+
+    protocol = "data"
+
+    def __init__(self, **kwargs):
+        """Takes no parameters of its own; ``kwargs`` go to the base class"""
+        super().__init__(**kwargs)
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        header, payload = path.split(",", 1)  # text before / after first comma
+        is_b64 = header.endswith("base64")
+        raw = base64.b64decode(payload) if is_b64 else unquote(payload).encode()
+        return raw[start:end]
+
+    def info(self, path, **kwargs):
+        header, name = path.split(",", 1)
+        size = len(self.cat_file(path))  # size of the decoded content
+        mime = header.split(":", 1)[1].split(";", 1)[0]
+        return {"name": name, "size": size, "type": "file", "mimetype": mime}
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        if "r" in mode:
+            return io.BytesIO(self.cat_file(path))
+        raise ValueError("Read only filesystem")
+
+    @staticmethod
+    def encode(data: bytes, mime: Optional[str] = None):
+        """Format the given data into data-URL syntax
+
+        The payload is always base64 encoded, even for ascii/url-safe data.
+        """
+        return f"data:{mime or ''};base64,{str(base64.b64encode(data), 'ascii')}"
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9f9eadb798577970ee95530743b4521813ca7c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dbfs.py
@@ -0,0 +1,467 @@
+import base64
+import urllib
+
+import requests
+import requests.exceptions
+from requests.adapters import HTTPAdapter, Retry
+
+from fsspec import AbstractFileSystem
+from fsspec.spec import AbstractBufferedFile
+
+
+class DatabricksException(Exception):
+    """
+    Error type of this module; keeps ``error_code`` and ``message`` as attributes.
+    """
+
+    def __init__(self, error_code, message):
+        """Store both arguments; only ``message`` is passed to ``Exception``"""
+        super().__init__(message)
+        # both fields stay readable on the instance after construction
+        self.message = message
+        self.error_code = error_code
+
+
+class DatabricksFileSystem(AbstractFileSystem):
+    """
+    Get access to the Databricks filesystem implementation over HTTP.
+    Can be used inside and outside of a databricks cluster.
+    """
+
+    def __init__(self, instance, token, **kwargs):
+        """
+        Create a new DatabricksFileSystem.
+
+        Parameters
+        ----------
+        instance: str
+            The instance URL of the databricks cluster.
+            For example for an Azure databricks cluster, this
+            has the form adb-<some-number>.<two digits>.azuredatabricks.net.
+        token: str
+            Your personal token. Find out more
+            here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
+        """
+        self.instance = instance
+        self.token = token
+        self.session = requests.Session()
+        self.retries = Retry(
+            total=10,
+            backoff_factor=0.05,
+            status_forcelist=[408, 429, 500, 502, 503, 504],
+        )
+
+        self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
+        self.session.headers.update({"Authorization": f"Bearer {self.token}"})
+
+        super().__init__(**kwargs)
+
+    def ls(self, path, detail=True, **kwargs):
+        """
+        List the contents of the given path.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path
+        detail: bool
+            Return not only the list of filenames,
+            but also additional information on file sizes
+            and types.
+        """
+        out = self._ls_from_cache(path)
+        if not out:
+            try:
+                r = self._send_to_api(
+                    method="get", endpoint="list", json={"path": path}
+                )
+            except DatabricksException as e:
+                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                    raise FileNotFoundError(e.message) from e
+
+                raise e
+            files = r["files"]
+            out = [
+                {
+                    "name": o["path"],
+                    "type": "directory" if o["is_dir"] else "file",
+                    "size": o["file_size"],
+                }
+                for o in files
+            ]
+            self.dircache[path] = out
+
+        if detail:
+            return out
+        return [o["name"] for o in out]
+
+    def makedirs(self, path, exist_ok=True):
+        """
+        Create a given absolute path and all of its parents.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path to create
+        exist_ok: bool
+            If false, checks if the folder
+            exists before creating it (and raises an
+            Exception if this is the case)
+        """
+        if not exist_ok:
+            try:
+                # If the following succeeds, the path is already present
+                self._send_to_api(
+                    method="get", endpoint="get-status", json={"path": path}
+                )
+                raise FileExistsError(f"Path {path} already exists")
+            except DatabricksException as e:
+                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                    pass  # NOTE(review): other error codes are also ignored here
+
+        try:
+            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_ALREADY_EXISTS":
+                raise FileExistsError(e.message) from e
+
+            raise e
+        self.invalidate_cache(self._parent(path))
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        """
+        Create a given absolute path and all of its parents.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path to create
+        create_parents: bool
+            Whether to create all parents or not.
+            "False" is not implemented so far.
+        """
+        if not create_parents:
+            raise NotImplementedError
+
+        self.mkdirs(path, **kwargs)
+
+    def rm(self, path, recursive=False, **kwargs):
+        """
+        Remove the file or folder at the given absolute path.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path to remove
+        recursive: bool
+            Recursively delete all files in a folder.
+        """
+        try:
+            self._send_to_api(
+                method="post",
+                endpoint="delete",
+                json={"path": path, "recursive": recursive},
+            )
+        except DatabricksException as e:
+            # PARTIAL_DELETE is not really an error, it just means not everything
+            # was deleted so far: retry, and do not re-raise once the retry is done
+            if e.error_code == "PARTIAL_DELETE":
+                self.rm(path=path, recursive=recursive)
+            elif e.error_code == "IO_ERROR":
+                # Using the same exception as the os module would use here
+                raise OSError(e.message) from e
+            else:
+                raise e
+        self.invalidate_cache(self._parent(path))
+
+    def mv(
+        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
+    ):
+        """
+        Move a source to a destination path.
+
+        A note from the original [databricks API manual]
+        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move).
+
+        When moving a large number of files the API call will time out after
+        approximately 60s, potentially resulting in partially moved data.
+        Therefore, for operations that move more than 10k files, we strongly
+        discourage using the DBFS REST API.
+
+        Parameters
+        ----------
+        source_path: str
+            From where to move (absolute path)
+        destination_path: str
+            To where to move (absolute path)
+        recursive: bool
+            Not implemented so far.
+        maxdepth:
+            Not implemented so far.
+        """
+        if recursive:
+            raise NotImplementedError
+        if maxdepth:
+            raise NotImplementedError
+
+        try:
+            self._send_to_api(
+                method="post",
+                endpoint="move",
+                json={"source_path": source_path, "destination_path": destination_path},
+            )
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                raise FileNotFoundError(e.message) from e
+            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
+                raise FileExistsError(e.message) from e
+
+            raise e
+        self.invalidate_cache(self._parent(source_path))
+        self.invalidate_cache(self._parent(destination_path))
+
+    def _open(self, path, mode="rb", block_size="default", **kwargs):
+        """
+        Overwrite the base class method to make sure to create a DBFile.
+        All arguments are copied from the base method.
+
+        Only the default blocksize is allowed.
+        """
+        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
+
+    def _send_to_api(self, method, endpoint, json):
+        """
+        Send the given json to the DBFS API
+        using a get or post request (specified by the argument `method`).
+
+        Parameters
+        ----------
+        method: str
+            Which http method to use for communication; "get" or "post".
+        endpoint: str
+            Where to send the request to (last part of the API URL)
+        json: dict
+            Dictionary of information to send
+        """
+        if method == "post":
+            session_call = self.session.post
+        elif method == "get":
+            session_call = self.session.get
+        else:
+            raise ValueError(f"Do not understand method {method}")
+
+        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)
+
+        r = session_call(url, json=json)
+
+        # The DBFS API will return a json, also in case of an exception.
+        # We want to preserve this information as good as possible.
+        try:
+            r.raise_for_status()
+        except requests.HTTPError as e:
+            # try to extract error_code/message from the json error body (extra
+            # keys are ignored); if that fails, fall back to the original exception
+            try:
+                body = e.response.json()
+                error = DatabricksException(body["error_code"], body["message"])
+            except Exception:
+                raise e
+            raise error from e
+
+        return r.json()
+
+    def _create_handle(self, path, overwrite=True):
+        """
+        Internal function to create a handle, which can be used to
+        write blocks of a file to DBFS.
+        A handle has a unique identifier which needs to be passed
+        whenever written during this transaction.
+        The handle is active for 10 minutes - after that a new
+        write transaction needs to be created.
+        Make sure to close the handle after you are finished.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path for this file.
+        overwrite: bool
+            If a file already exist at this location, either overwrite
+            it or raise an exception.
+        """
+        try:
+            r = self._send_to_api(
+                method="post",
+                endpoint="create",
+                json={"path": path, "overwrite": overwrite},
+            )
+            return r["handle"]
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_ALREADY_EXISTS":
+                raise FileExistsError(e.message) from e
+
+            raise e
+
+    def _close_handle(self, handle):
+        """
+        Close a handle, which was opened by :func:`_create_handle`.
+
+        Parameters
+        ----------
+        handle: str
+            Which handle to close.
+        """
+        try:
+            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                raise FileNotFoundError(e.message) from e
+
+            raise e
+
+    def _add_data(self, handle, data):
+        """
+        Upload data to an already opened file handle
+        (opened by :func:`_create_handle`).
+        The maximal allowed data size is 1MB after
+        conversion to base64.
+        Remember to close the handle when you are finished.
+
+        Parameters
+        ----------
+        handle: str
+            Which handle to upload data to.
+        data: bytes
+            Block of data to add to the handle.
+        """
+        data = base64.b64encode(data).decode()
+        try:
+            self._send_to_api(
+                method="post",
+                endpoint="add-block",
+                json={"handle": handle, "data": data},
+            )
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                raise FileNotFoundError(e.message) from e
+            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
+                raise ValueError(e.message) from e
+
+            raise e
+
+    def _get_data(self, path, start, end):
+        """
+        Download data in bytes from a given absolute path in a block
+        from [start, start+length].
+        The maximum number of allowed bytes to read is 1MB.
+
+        Parameters
+        ----------
+        path: str
+            Absolute path to download data from
+        start: int
+            Start position of the block
+        end: int
+            End position of the block
+        """
+        try:
+            r = self._send_to_api(
+                method="get",
+                endpoint="read",
+                json={"path": path, "offset": start, "length": end - start},
+            )
+            return base64.b64decode(r["data"])
+        except DatabricksException as e:
+            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
+                raise FileNotFoundError(e.message) from e
+            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
+                raise ValueError(e.message) from e
+
+            raise e
+
+    def invalidate_cache(self, path=None):  # path=None clears every cached listing
+        if path is None:
+            self.dircache.clear()
+        else:
+            self.dircache.pop(path, None)
+        super().invalidate_cache(path)
+
+
+class DatabricksFile(AbstractBufferedFile):
+    """
+    Buffered file object used by the DatabricksFileSystem.
+    """
+
+    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size
+
+    def __init__(
+        self,
+        fs,
+        path,
+        mode="rb",
+        block_size="default",
+        autocommit=True,
+        cache_type="readahead",
+        cache_options=None,
+        **kwargs,
+    ):
+        """
+        Create a new instance of the DatabricksFile.
+
+        ``block_size`` may be None, "default" or ``DEFAULT_BLOCK_SIZE``.
+        """
+        use_default = block_size is None or block_size == "default"
+        if use_default:
+            block_size = self.DEFAULT_BLOCK_SIZE
+
+        only_default = block_size == self.DEFAULT_BLOCK_SIZE
+        assert only_default, f"Only the default block size is allowed, not {block_size}"
+
+        super().__init__(
+            fs,
+            path,
+            mode=mode,
+            block_size=block_size,
+            autocommit=autocommit,
+            cache_type=cache_type,
+            cache_options=cache_options or {},
+            **kwargs,
+        )
+
+    def _initiate_upload(self):
+        """Create a write handle for ``self.path`` and keep it on ``self.handle``"""
+        self.handle = self.fs._create_handle(self.path)
+
+    def _upload_chunk(self, final=False):
+        """Send the buffered bytes in block-sized pieces; close on ``final``
+
+        Returns True after closing the handle when ``final``, otherwise None.
+        """
+        self.buffer.seek(0)
+        payload = self.buffer.getvalue()
+        # slice boundaries are computed up front, before any upload call
+        bounds = list(self._to_sized_blocks(len(payload)))
+        for lo, hi in bounds:
+            self.fs._add_data(handle=self.handle, data=payload[lo:hi])
+        if not final:
+            return None
+        self.fs._close_handle(handle=self.handle)
+        return True
+
+    def _fetch_range(self, start, end):
+        """Download the bytes between ``start`` and ``end``
+
+        The range is requested in pieces of at most ``self.blocksize`` bytes
+        and the pieces are concatenated in order.
+        """
+        pieces = []
+        for lo, hi in self._to_sized_blocks(end - start, start):
+            pieces.append(self.fs._get_data(path=self.path, start=lo, end=hi))
+        return b"".join(pieces)
+
+    def _to_sized_blocks(self, length, start=0):
+        """Yield (begin, stop) offsets splitting ``length`` bytes from ``start``"""
+        stop = start + length
+        for begin in range(start, stop, self.blocksize):
+            # the last piece is clipped so it never runs past ``stop``
+            chunk_stop = min(stop, begin + self.blocksize)
+            yield begin, chunk_stop
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b311ab2a1b53bd59fd785d3e4615aab6474c073
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/dirfs.py
@@ -0,0 +1,364 @@
+from .. import filesystem
+from ..asyn import AsyncFileSystem
+
+
+class DirFileSystem(AsyncFileSystem):
+    """Directory prefix filesystem
+
+    The DirFileSystem is a filesystem-wrapper. It assumes every path it is dealing with
+    is relative to the `path`. After performing the necessary path operations it
+    delegates everything to the wrapped filesystem.
+    """
+
+    protocol = "dir"
+
+    def __init__(
+        self,
+        path=None,
+        fs=None,
+        fo=None,
+        target_protocol=None,
+        target_options=None,
+        **storage_options,
+    ):
+        """
+        Parameters
+        ----------
+        path: str
+            Path to the directory.
+        fs: AbstractFileSystem
+            An instantiated filesystem to wrap.
+        target_protocol, target_options:
+            if fs is None, construct it from these
+        fo: str
+            Alternate for path; provide exactly one of path and fo
+        """
+        super().__init__(**storage_options)
+        if fs is None:
+            fs = filesystem(protocol=target_protocol, **(target_options or {}))
+        if (path is not None) ^ (fo is not None) is False:  # both or neither given
+            raise ValueError("Provide path or fo, not both")
+        path = path or fo
+
+        if self.asynchronous and not fs.async_impl:
+            raise ValueError("can't use asynchronous with non-async fs")
+
+        if fs.async_impl and self.asynchronous != fs.asynchronous:
+            raise ValueError("both dirfs and fs should be in the same sync/async mode")
+
+        self.path = fs._strip_protocol(path)  # prefix, as normalised by wrapped fs
+        self.fs = fs
+
+    def _join(self, path):
+        if isinstance(path, str):
+            if not self.path:
+                return path  # empty prefix: the path is passed through unchanged
+            if not path:
+                return self.path  # empty path stands for the prefix itself
+            return self.fs.sep.join((self.path, self._strip_protocol(path)))
+        return [self._join(_path) for _path in path]  # non-str: iterate over paths
+
+    def _relpath(self, path):
+        if isinstance(path, str):
+            if not self.path:
+                return path  # empty prefix: nothing to strip
+            if path == self.path:
+                return ""
+            prefix = self.path + self.fs.sep
+            assert path.startswith(prefix)  # any other path must lie under prefix
+            return path[len(prefix) :]
+        return [self._relpath(_path) for _path in path]  # non-str: iterate over paths
+
+    # Wrappers below
+
+    @property
+    def sep(self):
+        return self.fs.sep
+
+    async def set_session(self, *args, **kwargs):
+        return await self.fs.set_session(*args, **kwargs)
+
+    async def _rm_file(self, path, **kwargs):
+        return await self.fs._rm_file(self._join(path), **kwargs)
+
+    def rm_file(self, path, **kwargs):
+        return self.fs.rm_file(self._join(path), **kwargs)
+
+    async def _rm(self, path, *args, **kwargs):
+        return await self.fs._rm(self._join(path), *args, **kwargs)
+
+    def rm(self, path, *args, **kwargs):
+        return self.fs.rm(self._join(path), *args, **kwargs)
+
+    async def _cp_file(self, path1, path2, **kwargs):
+        return await self.fs._cp_file(self._join(path1), self._join(path2), **kwargs)
+
+    def cp_file(self, path1, path2, **kwargs):
+        return self.fs.cp_file(self._join(path1), self._join(path2), **kwargs)
+
+    async def _copy(
+        self,
+        path1,
+        path2,
+        *args,
+        **kwargs,
+    ):
+        return await self.fs._copy(
+            self._join(path1),
+            self._join(path2),
+            *args,
+            **kwargs,
+        )
+
+    def copy(self, path1, path2, *args, **kwargs):
+        return self.fs.copy(
+            self._join(path1),
+            self._join(path2),
+            *args,
+            **kwargs,
+        )
+
+    async def _pipe(self, path, *args, **kwargs):
+        return await self.fs._pipe(self._join(path), *args, **kwargs)
+
+    def pipe(self, path, *args, **kwargs):
+        return self.fs.pipe(self._join(path), *args, **kwargs)
+
+    async def _pipe_file(self, path, *args, **kwargs):
+        return await self.fs._pipe_file(self._join(path), *args, **kwargs)
+
+    def pipe_file(self, path, *args, **kwargs):
+        return self.fs.pipe_file(self._join(path), *args, **kwargs)
+
+    async def _cat_file(self, path, *args, **kwargs):
+        return await self.fs._cat_file(self._join(path), *args, **kwargs)
+
+    def cat_file(self, path, *args, **kwargs):
+        return self.fs.cat_file(self._join(path), *args, **kwargs)
+
+    async def _cat(self, path, *args, **kwargs):
+        ret = await self.fs._cat(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
+
+        if isinstance(ret, dict):  # dict keys are mapped back to relative paths
+            return {self._relpath(key): value for key, value in ret.items()}
+
+        return ret
+
+    def cat(self, path, *args, **kwargs):
+        ret = self.fs.cat(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
+
+        if isinstance(ret, dict):  # dict keys are mapped back to relative paths
+            return {self._relpath(key): value for key, value in ret.items()}
+
+        return ret
+
+    async def _put_file(self, lpath, rpath, **kwargs):
+        return await self.fs._put_file(lpath, self._join(rpath), **kwargs)
+
+    def put_file(self, lpath, rpath, **kwargs):
+        return self.fs.put_file(lpath, self._join(rpath), **kwargs)
+
+    async def _put(
+        self,
+        lpath,
+        rpath,
+        *args,
+        **kwargs,
+    ):
+        return await self.fs._put(
+            lpath,
+            self._join(rpath),
+            *args,
+            **kwargs,
+        )
+
+    def put(self, lpath, rpath, *args, **kwargs):
+        return self.fs.put(
+            lpath,
+            self._join(rpath),
+            *args,
+            **kwargs,
+        )
+
+    async def _get_file(self, rpath, lpath, **kwargs):
+        return await self.fs._get_file(self._join(rpath), lpath, **kwargs)
+
+    def get_file(self, rpath, lpath, **kwargs):
+        return self.fs.get_file(self._join(rpath), lpath, **kwargs)
+
+    async def _get(self, rpath, *args, **kwargs):
+        return await self.fs._get(self._join(rpath), *args, **kwargs)
+
+    def get(self, rpath, *args, **kwargs):
+        return self.fs.get(self._join(rpath), *args, **kwargs)
+
+    async def _isfile(self, path):
+        return await self.fs._isfile(self._join(path))
+
+    def isfile(self, path):
+        return self.fs.isfile(self._join(path))
+
+    async def _isdir(self, path):
+        return await self.fs._isdir(self._join(path))
+
+    def isdir(self, path):
+        return self.fs.isdir(self._join(path))
+
+    async def _size(self, path):
+        return await self.fs._size(self._join(path))
+
+    def size(self, path):
+        return self.fs.size(self._join(path))
+
+    async def _exists(self, path):
+        return await self.fs._exists(self._join(path))
+
+    def exists(self, path):
+        return self.fs.exists(self._join(path))
+
+    async def _info(self, path, **kwargs):
+        return await self.fs._info(self._join(path), **kwargs)
+
+    def info(self, path, **kwargs):
+        return self.fs.info(self._join(path), **kwargs)
+
+    async def _ls(self, path, detail=True, **kwargs):
+        ret = (await self.fs._ls(self._join(path), detail=detail, **kwargs)).copy()
+        if detail:
+            out = []
+            for entry in ret:
+                entry = entry.copy()  # rewrite "name" on a copy, not the original
+                entry["name"] = self._relpath(entry["name"])
+                out.append(entry)
+            return out
+
+        return self._relpath(ret)
+
+    def ls(self, path, detail=True, **kwargs):
+        ret = self.fs.ls(self._join(path), detail=detail, **kwargs).copy()
+        if detail:
+            out = []
+            for entry in ret:
+                entry = entry.copy()  # rewrite "name" on a copy, not the original
+                entry["name"] = self._relpath(entry["name"])
+                out.append(entry)
+            return out
+
+        return self._relpath(ret)
+
+    async def _walk(self, path, *args, **kwargs):
+        async for root, dirs, files in self.fs._walk(self._join(path), *args, **kwargs):
+            yield self._relpath(root), dirs, files  # only ``root`` is rewritten
+
+    def walk(self, path, *args, **kwargs):
+        for root, dirs, files in self.fs.walk(self._join(path), *args, **kwargs):
+            yield self._relpath(root), dirs, files  # only ``root`` is rewritten
+
+    async def _glob(self, path, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = await self.fs._glob(self._join(path), **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    def glob(self, path, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = self.fs.glob(self._join(path), **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    async def _du(self, path, *args, **kwargs):
+        total = kwargs.get("total", True)
+        ret = await self.fs._du(self._join(path), *args, **kwargs)
+        if total:
+            return ret  # returned unchanged when ``total`` is truthy
+
+        return {self._relpath(path): size for path, size in ret.items()}
+
+    def du(self, path, *args, **kwargs):
+        total = kwargs.get("total", True)
+        ret = self.fs.du(self._join(path), *args, **kwargs)
+        if total:
+            return ret  # returned unchanged when ``total`` is truthy
+
+        return {self._relpath(path): size for path, size in ret.items()}
+
+    async def _find(self, path, *args, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = await self.fs._find(self._join(path), *args, **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    def find(self, path, *args, **kwargs):
+        detail = kwargs.get("detail", False)
+        ret = self.fs.find(self._join(path), *args, **kwargs)
+        if detail:
+            return {self._relpath(path): info for path, info in ret.items()}
+        return self._relpath(ret)
+
+    async def _expand_path(self, path, *args, **kwargs):
+        return self._relpath(
+            await self.fs._expand_path(self._join(path), *args, **kwargs)
+        )
+
+    def expand_path(self, path, *args, **kwargs):
+        return self._relpath(self.fs.expand_path(self._join(path), *args, **kwargs))
+
+    async def _mkdir(self, path, *args, **kwargs):
+        return await self.fs._mkdir(self._join(path), *args, **kwargs)
+
+    def mkdir(self, path, *args, **kwargs):
+        return self.fs.mkdir(self._join(path), *args, **kwargs)
+
+    async def _makedirs(self, path, *args, **kwargs):
+        return await self.fs._makedirs(self._join(path), *args, **kwargs)
+
+    def makedirs(self, path, *args, **kwargs):
+        return self.fs.makedirs(self._join(path), *args, **kwargs)
+
+    def rmdir(self, path):
+        return self.fs.rmdir(self._join(path))
+
+    def mv(self, path1, path2, **kwargs):
+        return self.fs.mv(
+            self._join(path1),
+            self._join(path2),
+            **kwargs,
+        )
+
+    def touch(self, path, **kwargs):
+        return self.fs.touch(self._join(path), **kwargs)
+
+    def created(self, path):
+        return self.fs.created(self._join(path))
+
+    def modified(self, path):
+        return self.fs.modified(self._join(path))
+
+    def sign(self, path, *args, **kwargs):
+        return self.fs.sign(self._join(path), *args, **kwargs)
+
+    def __repr__(self):
+        return f"{self.__class__.__qualname__}(path='{self.path}', fs={self.fs})"
+
+    def open(
+        self,
+        path,
+        *args,
+        **kwargs,
+    ):
+        return self.fs.open(
+            self._join(path),
+            *args,
+            **kwargs,
+        )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py
new file mode 100644
index 0000000000000000000000000000000000000000..415f4844952f362188561a3e41425d364a115400
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/ftp.py
@@ -0,0 +1,385 @@
+import os
+import sys
+import uuid
+import warnings
+from ftplib import FTP, Error, error_perm
+from typing import Any
+
+from ..spec import AbstractBufferedFile, AbstractFileSystem
+from ..utils import infer_storage_options, isfilelike
+
+
+class FTPFileSystem(AbstractFileSystem):
+    """A filesystem over classic FTP"""
+
+    root_marker = "/"
+    cachable = False
+    protocol = "ftp"
+
+    def __init__(
+        self,
+        host,
+        port=21,
+        username=None,
+        password=None,
+        acct=None,
+        block_size=None,
+        tempdir=None,
+        timeout=30,
+        encoding="utf-8",
+        **kwargs,
+    ):
+        """
+        You can use _get_kwargs_from_urls to get some kwargs from
+        a reasonable FTP url.
+
+        Authentication will be anonymous if username/password are not
+        given.
+
+        Parameters
+        ----------
+        host: str
+            The remote server name/ip to connect to
+        port: int
+            Port to connect with
+        username: str or None
+            If authenticating, the user's identifier
+        password: str or None
+            User's password on the server, if using
+        acct: str or None
+            Some servers also need an "account" string for auth
+        block_size: int or None
+            The read-ahead or write buffer size; 2**16 is used if not given.
+        tempdir: str or None
+            Remote directory for temporary files when in a transaction (default "/tmp")
+        timeout: int
+            Timeout of the ftp connection in seconds
+        encoding: str
+            Encoding to use for directories and filenames in FTP connection
+        """
+        super().__init__(**kwargs)
+        self.host = host
+        self.port = port
+        self.tempdir = tempdir or "/tmp"
+        self.cred = username, password, acct
+        self.timeout = timeout
+        self.encoding = encoding
+        if block_size is not None:
+            self.blocksize = block_size
+        else:
+            self.blocksize = 2**16
+        self._connect()
+
+    def _connect(self):
+        if sys.version_info >= (3, 9):
+            self.ftp = FTP(timeout=self.timeout, encoding=self.encoding)
+        elif self.encoding:
+            warnings.warn("`encoding` not supported for python<3.9, ignoring")
+            self.ftp = FTP(timeout=self.timeout)
+        else:
+            self.ftp = FTP(timeout=self.timeout)
+        self.ftp.connect(self.host, self.port)
+        self.ftp.login(*self.cred)
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        return "/" + infer_storage_options(path)["path"].lstrip("/").rstrip("/")
+
+    @staticmethod
+    def _get_kwargs_from_urls(urlpath):
+        out = infer_storage_options(urlpath)
+        out.pop("path", None)
+        out.pop("protocol", None)
+        return out
+
+    def ls(self, path, detail=True, **kwargs):
+        """List ``path``: info dicts if ``detail``, else sorted listed names; cached"""
+        path = self._strip_protocol(path)
+        out = []
+        if path not in self.dircache:
+            try:
+                try:
+                    out = [
+                        (fn, details)
+                        for (fn, details) in self.ftp.mlsd(path)
+                        if fn not in [".", ".."]
+                        and details["type"] not in ["pdir", "cdir"]
+                    ]
+                except error_perm:
+                    out = _mlsd2(self.ftp, path)  # Not platform independent
+                prefix = "" if path == "/" else path  # `path` stays the cache key
+                for fn, details in out:
+                    details["name"] = "/".join([prefix, fn.lstrip("/")])
+                    if details["type"] == "file":
+                        details["size"] = int(details["size"])
+                    else:
+                        details["size"] = 0
+                    if details["type"] == "dir":
+                        details["type"] = "directory"
+                self.dircache[path] = out
+            except Error:
+                try:
+                    info = self.info(path)
+                    if info["type"] == "file":
+                        out = [(path, info)]
+                except (Error, IndexError):
+                    raise FileNotFoundError(path)
+        files = self.dircache.get(path, out)
+        if not detail:
+            return sorted([fn for fn, details in files])
+        return [details for fn, details in files]
+
+    def info(self, path, **kwargs):
+        # implement with direct method
+        path = self._strip_protocol(path)
+        if path == "/":
+            # special case, since this dir has no real entry
+            return {"name": "/", "size": 0, "type": "directory"}
+        files = self.ls(self._parent(path).lstrip("/"), True)
+        try:
+            out = [f for f in files if f["name"] == path][0]
+        except IndexError:
+            raise FileNotFoundError(path)
+        return out
+
+    def get_file(self, rpath, lpath, **kwargs):
+        """Download ``rpath`` into ``lpath`` (a local path or a writable file-like object)"""
+        if self.isdir(rpath):
+            # a remote directory only results in the local directory being created
+            if not os.path.exists(lpath):
+                os.mkdir(lpath)
+            return
+        # only a handle opened here is ours to close; caller-supplied ones stay open
+        owned = not isfilelike(lpath)
+        outfile = open(lpath, "wb") if owned else lpath
+        try:
+            self.ftp.retrbinary(
+                f"RETR {rpath}",
+                blocksize=self.blocksize,
+                callback=outfile.write,
+            )
+        finally:
+            # close even when the transfer fails, so the descriptor is not leaked
+            if owned:
+                outfile.close()
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        if end is not None:
+            return super().cat_file(path, start, end, **kwargs)
+        out = []
+
+        def cb(x):
+            out.append(x)
+
+        try:
+            self.ftp.retrbinary(
+                f"RETR {path}",
+                blocksize=self.blocksize,
+                rest=start,
+                callback=cb,
+            )
+        except (Error, error_perm) as orig_exc:
+            raise FileNotFoundError(path) from orig_exc
+        return b"".join(out)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        cache_options=None,
+        autocommit=True,
+        **kwargs,
+    ):
+        path = self._strip_protocol(path)
+        block_size = block_size or self.blocksize
+        return FTPFile(
+            self,
+            path,
+            mode=mode,
+            block_size=block_size,
+            tempdir=self.tempdir,
+            autocommit=autocommit,
+            cache_options=cache_options,
+        )
+
+    def _rm(self, path):
+        path = self._strip_protocol(path)
+        self.ftp.delete(path)
+        self.invalidate_cache(self._parent(path))
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
+        for p in reversed(paths):
+            if self.isfile(p):
+                self.rm_file(p)
+            else:
+                self.rmdir(p)
+
+    def mkdir(self, path: str, create_parents: bool = True, **kwargs: Any) -> None:
+        path = self._strip_protocol(path)
+        parent = self._parent(path)
+        if parent != self.root_marker and not self.exists(parent) and create_parents:
+            self.mkdir(parent, create_parents=create_parents)
+
+        self.ftp.mkd(path)
+        self.invalidate_cache(self._parent(path))
+
+    def makedirs(self, path: str, exist_ok: bool = False) -> None:
+        path = self._strip_protocol(path)
+        if self.exists(path):
+            # NB: "/" does not "exist" as it has no directory entry
+            if not exist_ok:
+                raise FileExistsError(f"{path} exists without `exist_ok`")
+            # exists_ok=True -> no-op
+        else:
+            self.mkdir(path, create_parents=True)
+
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        self.ftp.rmd(path)
+        self.invalidate_cache(self._parent(path))
+
+    def mv(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        self.ftp.rename(path1, path2)
+        self.invalidate_cache(self._parent(path1))
+        self.invalidate_cache(self._parent(path2))
+
+    def __del__(self):
+        self.ftp.close()
+
+    def invalidate_cache(self, path=None):
+        if path is None:
+            self.dircache.clear()
+        else:
+            self.dircache.pop(path, None)
+        super().invalidate_cache(path)
+
+
+class TransferDone(Exception):
+    """Internal exception to break out of transfer"""
+
+    pass
+
+
+class FTPFile(AbstractBufferedFile):
+    """Interact with a remote FTP file with read/write buffering"""
+
+    def __init__(
+        self,
+        fs,
+        path,
+        mode="rb",
+        block_size="default",
+        autocommit=True,
+        cache_type="readahead",
+        cache_options=None,
+        **kwargs,
+    ):
+        """
+        Open ``path`` on ``fs``. With ``autocommit=False`` and a non-read ``mode``,
+        data goes to a uniquely named file under ``tempdir`` (a keyword argument,
+        else ``fs.tempdir``) until ``commit()`` moves it to the requested path.
+        """
+        super().__init__(
+            fs, path, mode=mode, block_size=block_size, autocommit=autocommit,
+            cache_type=cache_type, cache_options=cache_options, **kwargs,
+        )
+        # reads have nothing to commit, so they keep using the real path
+        if not autocommit and "r" not in mode:
+            tempdir = kwargs.get("tempdir") or fs.tempdir
+            self.target, self.path = self.path, "/".join([tempdir, str(uuid.uuid4())])
+
+    def commit(self):
+        self.fs.mv(self.path, self.target)
+
+    def discard(self):
+        self.fs.rm(self.path)
+
+    def _fetch_range(self, start, end):
+        """Get bytes between given byte limits
+
+        Implemented by raising an exception in the fetch callback when the
+        number of bytes received reaches the requested amount.
+
+        Will fail if the server does not respect the REST command on
+        retrieve requests.
+        """
+        out = []
+        total = [0]
+
+        def callback(x):
+            total[0] += len(x)
+            if total[0] > end - start:
+                out.append(x[: (end - start) - total[0]])
+                if end < self.size:
+                    raise TransferDone
+            else:
+                out.append(x)
+
+            if total[0] == end - start and end < self.size:
+                raise TransferDone
+
+        try:
+            self.fs.ftp.retrbinary(
+                f"RETR {self.path}",
+                blocksize=self.blocksize,
+                rest=start,
+                callback=callback,
+            )
+        except TransferDone:
+            try:
+                # stop transfer, we got enough bytes for this block
+                self.fs.ftp.abort()
+                self.fs.ftp.getmultiline()
+            except Error:
+                self.fs._connect()
+
+        return b"".join(out)
+
+    def _upload_chunk(self, final=False):
+        self.buffer.seek(0)
+        self.fs.ftp.storbinary(
+            f"STOR {self.path}", self.buffer, blocksize=self.blocksize, rest=self.offset
+        )
+        return True
+
+
+def _mlsd2(ftp, path="."):
+    """
+    Fall back to using `dir` instead of `mlsd` if not supported.
+
+    This parses a Linux style `ls -l` response to `dir`, but the response may
+    be platform dependent. Lines with fewer than nine fields are skipped.
+
+    Parameters
+    ----------
+    ftp: ftplib.FTP
+    path: str
+        Expects to be given path, but defaults to ".".
+
+    Returns a list of ``(name, details)`` tuples, ``details`` being a dict.
+    """
+    lines = []
+    minfo = []
+    ftp.dir(path, lines.append)
+    for line in lines:
+        # at most 8 splits, so a name containing spaces stays in the ninth field
+        split_line = line.split(None, 8)
+        if len(split_line) < 9:
+            continue
+        mode, name = split_line[0], split_line[8]
+        if mode.startswith("l"):
+            name = name.split(" -> ", 1)[0]  # report the link itself, not its target
+        details = {
+            "modify": " ".join(split_line[5:8]),
+            "unix.owner": split_line[2],
+            "unix.group": split_line[3],
+            "unix.mode": mode,
+            "size": split_line[4],
+            "type": "dir" if mode.startswith("d") else "file",
+        }
+        minfo.append((name, details))
+    return minfo
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c34d93e08c20fc65421e5aa4bab53e8c683fee7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/git.py
@@ -0,0 +1,127 @@
+import os
+
+import pygit2
+
+from fsspec.spec import AbstractFileSystem
+
+from .memory import MemoryFile
+
+
+class GitFileSystem(AbstractFileSystem):
+    """Browse the files of a local git repo at any hash/tag/branch
+
+    (experimental backend)
+    """
+
+    root_marker = ""
+    cachable = True
+
+    def __init__(self, path=None, fo=None, ref=None, **kwargs):
+        """
+
+        Parameters
+        ----------
+        path: str (optional)
+            Local location of the repo (uses current directory if not given).
+            May be deprecated in favour of ``fo``. When used with a higher
+            level function such as fsspec.open(), may be of the form
+            "git://[path-to-repo[:]][ref@]path/to/file" (but the actual
+            file path should not contain "@" or ":").
+        fo: str (optional)
+            Same as ``path``, but passed as part of a chained URL. This one
+            takes precedence if both are given.
+        ref: str (optional)
+            Reference to work with, could be a hash, tag or branch name. Defaults
+            to "master". Note that ``ls`` and ``open`` also accept a ``ref``
+            argument, for which this value is the default
+        kwargs
+        """
+        super().__init__(**kwargs)
+        self.repo = pygit2.Repository(fo or path or os.getcwd())
+        self.ref = ref or "master"
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        path = super()._strip_protocol(path).lstrip("/")
+        if ":" in path:
+            path = path.split(":", 1)[1]
+        if "@" in path:
+            path = path.split("@", 1)[1]
+        return path.lstrip("/")
+
+    def _path_to_object(self, path, ref):
+        comm, ref = self.repo.resolve_refish(ref or self.ref)
+        parts = path.split("/")
+        tree = comm.tree
+        for part in parts:
+            if part and isinstance(tree, pygit2.Tree):
+                tree = tree[part]
+        return tree
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        if path.startswith("git://"):
+            path = path[6:]
+        out = {}
+        if ":" in path:
+            out["path"], path = path.split(":", 1)
+        if "@" in path:
+            out["ref"], path = path.split("@", 1)
+        return out
+
+    def ls(self, path, detail=True, ref=None, **kwargs):
+        path = self._strip_protocol(path)
+        tree = self._path_to_object(path, ref)
+        if isinstance(tree, pygit2.Tree):
+            out = []
+            for obj in tree:
+                if isinstance(obj, pygit2.Tree):
+                    out.append(
+                        {
+                            "type": "directory",
+                            "name": "/".join([path, obj.name]).lstrip("/"),
+                            "hex": obj.hex,
+                            "mode": f"{obj.filemode:o}",
+                            "size": 0,
+                        }
+                    )
+                else:
+                    out.append(
+                        {
+                            "type": "file",
+                            "name": "/".join([path, obj.name]).lstrip("/"),
+                            "hex": obj.hex,
+                            "mode": f"{obj.filemode:o}",
+                            "size": obj.size,
+                        }
+                    )
+        else:
+            obj = tree
+            out = [
+                {
+                    "type": "file",
+                    "name": obj.name,
+                    "hex": obj.hex,
+                    "mode": f"{obj.filemode:o}",
+                    "size": obj.size,
+                }
+            ]
+        if detail:
+            return out
+        return [o["name"] for o in out]
+
+    def ukey(self, path, ref=None):
+        return self.info(path, ref=ref)["hex"]
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        ref=None,
+        **kwargs,
+    ):
+        obj = self._path_to_object(path, ref or self.ref)
+        return MemoryFile(data=obj.data)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8fe7a2f67d7d9648e75b82d5f0b5cf2bf7a4868
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/github.py
@@ -0,0 +1,227 @@
+import requests
+
+from ..spec import AbstractFileSystem
+from ..utils import infer_storage_options
+from .memory import MemoryFile
+
+# TODO: add GIST backend, would be very similar
+
+
+class GithubFileSystem(AbstractFileSystem):
+    """Interface to files in github
+
+    An instance of this class provides the files residing within a remote github
+    repository. You may specify a point in the repo's history, by SHA, branch
+    or tag (default is the repository's default branch).
+
+    Given that code files tend to be small, and that github does not support
+    retrieving partial content, we always fetch whole files.
+
+    When using fsspec.open, allows URIs of the form:
+
+    - "github://path/file", in which case you must specify org, repo and
+      may specify sha in the extra args
+    - 'github://org:repo@/precip/catalog.yml', where the org and repo are
+      part of the URI
+    - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
+
+    ``sha`` can be the full or abbreviated hex of the commit you want to fetch
+    from, or a branch or tag name (so long as it doesn't contain special characters
+    like "/", "?", which would have to be HTTP-encoded).
+
+    For authorised access, you must provide username and token, which can be made
+    at https://github.com/settings/tokens
+    """
+
+    url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
+    rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
+    protocol = "github"
+    timeout = (60, 60)  # connect, read timeouts
+
+    def __init__(
+        self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.org = org
+        self.repo = repo
+        if (username is None) ^ (token is None):
+            raise ValueError("Auth required both username and token")
+        self.username = username
+        self.token = token
+        if timeout is not None:
+            self.timeout = timeout
+        if sha is None:
+            # look up default branch (not necessarily "master")
+            u = "https://api.github.com/repos/{org}/{repo}"
+            r = requests.get(
+                u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
+            )
+            r.raise_for_status()
+            sha = r.json()["default_branch"]
+
+        self.root = sha
+        self.ls("")
+
+    @property
+    def kw(self):
+        if self.username:
+            return {"auth": (self.username, self.token)}
+        return {}
+
+    @classmethod
+    def repos(cls, org_or_user, is_org=True):
+        """List repo names for given org or user
+
+        This may become the top level of the FS
+
+        Parameters
+        ----------
+        org_or_user: str
+            Name of the github org or user to query
+        is_org: bool (default True)
+            Whether the name is an organisation (True) or user (False)
+
+        Returns
+        -------
+        List of string
+        """
+        r = requests.get(
+            f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
+            timeout=cls.timeout,
+        )
+        r.raise_for_status()
+        return [repo["name"] for repo in r.json()]
+
+    @property
+    def tags(self):
+        """Names of tags in the repo"""
+        r = requests.get(
+            f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
+            timeout=self.timeout,
+            **self.kw,
+        )
+        r.raise_for_status()
+        return [t["name"] for t in r.json()]
+
+    @property
+    def branches(self):
+        """Names of branches in the repo"""
+        r = requests.get(
+            f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
+            timeout=self.timeout,
+            **self.kw,
+        )
+        r.raise_for_status()
+        return [t["name"] for t in r.json()]
+
+    @property
+    def refs(self):
+        """Named references, tags and branches"""
+        return {"tags": self.tags, "branches": self.branches}
+
+    def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
+        """List files at given path
+
+        Parameters
+        ----------
+        path: str
+            Location to list, relative to repo root
+        detail: bool
+            If True, returns list of dicts, one per file; if False, returns
+            list of full filenames only
+        sha: str (optional)
+            List at the given point in the repo history, branch or tag name or commit
+            SHA
+        _sha: str (optional)
+            List this specific tree object (used internally to descend into trees)
+        """
+        path = self._strip_protocol(path)
+        if path == "":
+            _sha = sha or self.root
+        if _sha is None:
+            parts = path.rstrip("/").split("/")
+            so_far = ""
+            _sha = sha or self.root
+            for part in parts:
+                out = self.ls(so_far, True, sha=sha, _sha=_sha)
+                so_far += "/" + part if so_far else part
+                out = [o for o in out if o["name"] == so_far]
+                if not out:
+                    raise FileNotFoundError(path)
+                out = out[0]
+                if out["type"] == "file":
+                    # a file lists as itself; the result is a list in both forms
+                    if detail:
+                        return [out]
+                    return [path]
+                _sha = out["sha"]
+        if path not in self.dircache or sha not in [self.root, None]:
+            r = requests.get(
+                self.url.format(org=self.org, repo=self.repo, sha=_sha),
+                timeout=self.timeout,
+                **self.kw,
+            )
+            if r.status_code == 404:
+                raise FileNotFoundError(path)
+            r.raise_for_status()
+            types = {"blob": "file", "tree": "directory"}
+            out = [
+                {
+                    "name": path + "/" + f["path"] if path else f["path"],
+                    "mode": f["mode"],
+                    "type": types[f["type"]],
+                    "size": f.get("size", 0),
+                    "sha": f["sha"],
+                }
+                for f in r.json()["tree"]
+                if f["type"] in types
+            ]
+            if sha in [self.root, None]:
+                self.dircache[path] = out
+        else:
+            out = self.dircache[path]
+        if detail:
+            return out
+        else:
+            return sorted([f["name"] for f in out])
+
+    def invalidate_cache(self, path=None):
+        self.dircache.clear()
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        opts = infer_storage_options(path)
+        if "username" not in opts:
+            return super()._strip_protocol(path)
+        return opts["path"].lstrip("/")
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        opts = infer_storage_options(path)
+        if "username" not in opts:
+            return {}
+        out = {"org": opts["username"], "repo": opts["password"]}
+        if opts["host"]:
+            out["sha"] = opts["host"]
+        return out
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        sha=None,
+        **kwargs,
+    ):
+        if mode != "rb":
+            raise NotImplementedError
+        url = self.rurl.format(
+            org=self.org, repo=self.repo, path=path, sha=sha or self.root
+        )
+        r = requests.get(url, timeout=self.timeout, **self.kw)
+        if r.status_code == 404:
+            raise FileNotFoundError(path)
+        r.raise_for_status()
+        return MemoryFile(None, None, r.content)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py
new file mode 100644
index 0000000000000000000000000000000000000000..4580764ce8495294bfa0ca44fcb60a4c2377f6b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/http.py
@@ -0,0 +1,871 @@
+import asyncio
+import io
+import logging
+import re
+import weakref
+from copy import copy
+from urllib.parse import urlparse
+
+import aiohttp
+import yarl
+
+from fsspec.asyn import AbstractAsyncStreamedFile, AsyncFileSystem, sync, sync_wrapper
+from fsspec.callbacks import DEFAULT_CALLBACK
+from fsspec.exceptions import FSTimeoutError
+from fsspec.spec import AbstractBufferedFile
+from fsspec.utils import (
+    DEFAULT_BLOCK_SIZE,
+    glob_translate,
+    isfilelike,
+    nullcontext,
+    tokenize,
+)
+
+from ..caching import AllBytes
+
+# https://stackoverflow.com/a/15926317/3821154
+ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
+ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
+logger = logging.getLogger("fsspec.http")
+
+
+async def get_client(**kwargs):
+    return aiohttp.ClientSession(**kwargs)
+
+
+class HTTPFileSystem(AsyncFileSystem):
+    """
+    Simple File-System for fetching data via HTTP(S)
+
+    ``ls()`` is implemented by loading the parent page and doing a regex
+    match on the result. If simple_links=True, anything of the form
+    "http(s)://server.com/stuff?thing=other" is also treated as a link;
+    otherwise only links within HTML href tags will be used.
+    """
+
+    sep = "/"
+
+    def __init__(
+        self,
+        simple_links=True,
+        block_size=None,
+        same_scheme=True,
+        size_policy=None,
+        cache_type="bytes",
+        cache_options=None,
+        asynchronous=False,
+        loop=None,
+        client_kwargs=None,
+        get_client=get_client,
+        encoded=False,
+        **storage_options,
+    ):
+        """
+        NB: if this is called async, you must await set_session
+
+        Parameters
+        ----------
+        block_size: int
+            Blocks to read bytes; if 0, will default to raw requests file-like
+            objects instead of HTTPFile instances
+        simple_links: bool
+            If True, will consider both HTML <a> tags and anything that looks
+            like a URL; if False, will consider only the former.
+        same_scheme: bool
+            When doing ls/glob, if this is True, only consider paths that have
+            http/https matching the input URLs.
+        size_policy: this argument is deprecated
+        client_kwargs: dict
+            Passed to aiohttp.ClientSession, see
+            https://docs.aiohttp.org/en/stable/client_reference.html
+            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
+        get_client: Callable[..., aiohttp.ClientSession]
+            A callable which takes keyword arguments and constructs
+            an aiohttp.ClientSession. Its state will be managed by
+            the HTTPFileSystem class.
+        storage_options: key-value
+            Any other parameters passed on to requests
+        cache_type, cache_options: defaults used in open
+        """
+        super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
+        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
+        self.simple_links = simple_links
+        self.same_schema = same_scheme
+        self.cache_type = cache_type
+        self.cache_options = cache_options
+        self.client_kwargs = client_kwargs or {}
+        self.get_client = get_client
+        self.encoded = encoded
+        self.kwargs = storage_options
+        self._session = None
+
+        # Clean caching-related parameters from `storage_options`
+        # before propagating them as `request_options` through `self.kwargs`.
+        # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
+        #       it clearer.
+        request_options = copy(storage_options)
+        self.use_listings_cache = request_options.pop("use_listings_cache", False)
+        request_options.pop("listings_expiry_time", None)
+        request_options.pop("max_paths", None)
+        request_options.pop("skip_instance_cache", None)
+        self.kwargs = request_options
+
+    @property
+    def fsid(self):
+        return "http"
+
+    def encode_url(self, url):
+        return yarl.URL(url, encoded=self.encoded)
+
+    @staticmethod
+    def close_session(loop, session):
+        if loop is not None and loop.is_running():
+            try:
+                sync(loop, session.close, timeout=0.1)
+                return
+            except (TimeoutError, FSTimeoutError, NotImplementedError):
+                pass
+        connector = getattr(session, "_connector", None)
+        if connector is not None:
+            # close after loop is dead
+            connector._close()
+
+    async def set_session(self):
+        if self._session is None:
+            self._session = await self.get_client(loop=self.loop, **self.client_kwargs)
+            if not self.asynchronous:
+                weakref.finalize(self, self.close_session, self.loop, self._session)
+        return self._session
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """For HTTP, we always want to keep the full URL"""
+        return path
+
+    @classmethod
+    def _parent(cls, path):
+        # override, since _strip_protocol is different for URLs
+        par = super()._parent(path)
+        if len(par) > 7:  # "http://..."
+            return par
+        return ""
+
+    async def _ls_real(self, url, detail=True, **kwargs):
+        """Fetch ``url`` and return the links found in it that lie below ``url``"""
+        # ignoring URL-encoded arguments
+        kw = {**self.kwargs, **kwargs}  # per-call options override instance ones
+        logger.debug(url)
+        session = await self.set_session()
+        async with session.get(self.encode_url(url), **kw) as r:
+            self._raise_not_found_for_status(r, url)
+            try:
+                text = await r.text()
+                links = [u[2] for u in ex.findall(text)]  # <a href=...> targets
+                if self.simple_links:
+                    links = ex2.findall(text) + links  # plus anything URL-like
+            except UnicodeDecodeError:
+                links = []  # binary, not HTML
+        out = set()
+        parts = urlparse(url)
+        for l in links:
+            if isinstance(l, tuple):
+                l = l[1]
+            if l.startswith("/") and len(l) > 1:
+                # absolute URL on this server
+                l = f"{parts.scheme}://{parts.netloc}{l}"
+            if l.startswith("http"):
+                if self.same_schema and l.startswith(url.rstrip("/") + "/"):
+                    out.add(l)
+                # NOTE(review): also reached when same_schema is True -- confirm intent
+                elif l.replace("https", "http").startswith(
+                    url.replace("https", "http").rstrip("/") + "/"
+                ):
+                    # allowed to cross http <-> https
+                    out.add(l)
+            else:
+                if l not in ["..", "../"]:
+                    # Ignore FTP-like "parent"
+                    out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
+        if not out and url.endswith("/"):
+            out = await self._ls_real(url.rstrip("/"), detail=False, **kwargs)
+        if detail:
+            return [
+                {
+                    "name": u,
+                    "size": None,
+                    "type": "directory" if u.endswith("/") else "file",
+                }
+                for u in out
+            ]
+        else:
+            return sorted(out)
+
+    async def _ls(self, url, detail=True, **kwargs):
+        if self.use_listings_cache and url in self.dircache:
+            out = self.dircache[url]  # cached entries are always the detailed dicts
+        else:
+            out = await self._ls_real(url, detail=True, **kwargs)  # cache one shape only
+            self.dircache[url] = out
+        return out if detail else sorted(o["name"] for o in out)  # honour `detail` on hits too
+
+    ls = sync_wrapper(_ls)
+
+    def _raise_not_found_for_status(self, response, url):
+        """
+        Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
+        """
+        if response.status == 404:
+            raise FileNotFoundError(url)
+        response.raise_for_status()
+
+    async def _cat_file(self, url, start=None, end=None, **kwargs):
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(url)
+
+        if start is not None or end is not None:
+            if start == end:
+                return b""
+            headers = kw.pop("headers", {}).copy()
+
+            headers["Range"] = await self._process_limits(url, start, end)
+            kw["headers"] = headers
+        session = await self.set_session()
+        async with session.get(self.encode_url(url), **kw) as r:
+            out = await r.read()
+            self._raise_not_found_for_status(r, url)
+        return out
+
+    async def _get_file(
+        self, rpath, lpath, chunk_size=5 * 2**20, callback=DEFAULT_CALLBACK, **kwargs
+    ):
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        logger.debug(rpath)
+        session = await self.set_session()
+        async with session.get(self.encode_url(rpath), **kw) as r:
+            try:
+                size = int(r.headers["content-length"])
+            except (ValueError, KeyError):
+                size = None
+
+            callback.set_size(size)
+            self._raise_not_found_for_status(r, rpath)
+            if isfilelike(lpath):
+                outfile = lpath
+            else:
+                outfile = open(lpath, "wb")  # noqa: ASYNC101
+
+            try:
+                chunk = True
+                while chunk:
+                    chunk = await r.content.read(chunk_size)
+                    outfile.write(chunk)
+                    callback.relative_update(len(chunk))
+            finally:
+                if not isfilelike(lpath):
+                    outfile.close()
+
+    async def _put_file(
+        self,
+        lpath,
+        rpath,
+        chunk_size=5 * 2**20,
+        callback=DEFAULT_CALLBACK,
+        method="post",
+        **kwargs,
+    ):
+        async def gen_chunks():
+            # Support passing arbitrary file-like objects
+            # and use them instead of streams.
+            if isinstance(lpath, io.IOBase):
+                context = nullcontext(lpath)
+                use_seek = False  # might not support seeking
+            else:
+                context = open(lpath, "rb")  # noqa: ASYNC101
+                use_seek = True
+
+            with context as f:
+                if use_seek:
+                    callback.set_size(f.seek(0, 2))
+                    f.seek(0)
+                else:
+                    callback.set_size(getattr(f, "size", None))
+
+                chunk = f.read(chunk_size)
+                while chunk:
+                    yield chunk
+                    callback.relative_update(len(chunk))
+                    chunk = f.read(chunk_size)
+
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        session = await self.set_session()
+
+        method = method.lower()
+        if method not in ("post", "put"):
+            raise ValueError(
+                f"method has to be either 'post' or 'put', not: {method!r}"
+            )
+
+        meth = getattr(session, method)
+        async with meth(self.encode_url(rpath), data=gen_chunks(), **kw) as resp:
+            self._raise_not_found_for_status(resp, rpath)
+
+    async def _exists(self, path, **kwargs):
+        kw = self.kwargs.copy()
+        kw.update(kwargs)
+        try:
+            logger.debug(path)
+            session = await self.set_session()
+            r = await session.get(self.encode_url(path), **kw)
+            async with r:
+                return r.status < 400
+        except aiohttp.ClientError:
+            return False
+
+    async def _isfile(self, path, **kwargs):
+        return await self._exists(path, **kwargs)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=None,  # XXX: This differs from the base class.
+        cache_type=None,
+        cache_options=None,
+        size=None,
+        **kwargs,
+    ):
+        """Make a file-like object
+
+        Parameters
+        ----------
+        path: str
+            Full URL with protocol
+        mode: string
+            must be "rb"
+        block_size: int or None
+            Bytes to download in one request; use instance value if None. If
+            zero, will return a streaming Requests file-like instance.
+        kwargs: key-value
+            Any other parameters, passed to requests calls
+        """
+        if mode != "rb":
+            raise NotImplementedError
+        block_size = block_size if block_size is not None else self.block_size
+        kw = self.kwargs.copy()
+        kw["asynchronous"] = self.asynchronous
+        kw.update(kwargs)
+        size = size or self.info(path, **kwargs)["size"]
+        session = sync(self.loop, self.set_session)
+        if block_size and size:
+            return HTTPFile(
+                self,
+                path,
+                session=session,
+                block_size=block_size,
+                mode=mode,
+                size=size,
+                cache_type=cache_type or self.cache_type,
+                cache_options=cache_options or self.cache_options,
+                loop=self.loop,
+                **kw,
+            )
+        else:
+            return HTTPStreamFile(
+                self,
+                path,
+                mode=mode,
+                loop=self.loop,
+                session=session,
+                **kw,
+            )
+
+    async def open_async(self, path, mode="rb", size=None, **kwargs):
+        session = await self.set_session()
+        if size is None:
+            try:
+                size = (await self._info(path, **kwargs))["size"]
+            except FileNotFoundError:
+                pass
+        return AsyncStreamFile(
+            self,
+            path,
+            loop=self.loop,
+            session=session,
+            size=size,
+            **kwargs,
+        )
+
+    def ukey(self, url):
+        """Unique identifier; assume HTTP files are static, unchanging"""
+        return tokenize(url, self.kwargs, self.protocol)
+
+    async def _info(self, url, **kwargs):
+        """Get info of URL
+
+        Tries to access location via HEAD, and then GET methods, but does
+        not fetch the data.
+
+        It is possible that the server does not supply any size information, in
+        which case size will be given as None (and certain operations on the
+        corresponding file will not work).
+        """
+        info = {}
+        session = await self.set_session()
+
+        for policy in ["head", "get"]:
+            try:
+                info.update(
+                    await _file_info(
+                        self.encode_url(url),
+                        size_policy=policy,
+                        session=session,
+                        # merged: a per-call kwarg overrides the instance default
+                        **{**self.kwargs, **kwargs},
+                    )
+                )
+                if info.get("size") is not None:
+                    break  # size found; no need to fall back to GET
+            except Exception as exc:
+                if policy == "get":
+                    # If get failed, then raise a FileNotFoundError
+                    raise FileNotFoundError(url) from exc
+                logger.debug("", exc_info=exc)
+
+        return {"name": url, "size": None, **info, "type": "file"}
+
+    async def _glob(self, path, maxdepth=None, **kwargs):
+        """
+        Find files by glob-matching.
+
+        This implementation is identical to the one in AbstractFileSystem,
+        but "?" is not considered as a character for globbing, because it is
+        so common in URLs, often identifying the "query" part.
+        """
+        if maxdepth is not None and maxdepth < 1:
+            raise ValueError("maxdepth must be at least 1")
+        import re
+
+        ends_with_slash = path.endswith("/")  # _strip_protocol strips trailing slash
+        path = self._strip_protocol(path)
+        append_slash_to_dirname = ends_with_slash or path.endswith(("/**", "/*"))
+        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
+        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
+
+        min_idx = min(idx_star, idx_brace)  # first "*" or "[" (len(path) if neither)
+
+        detail = kwargs.pop("detail", False)
+
+        if not has_magic(path):
+            if await self._exists(path, **kwargs):
+                if not detail:
+                    return [path]
+                else:
+                    return {path: await self._info(path, **kwargs)}
+            else:
+                if not detail:
+                    return []  # glob of non-existent returns empty
+                else:
+                    return {}
+        elif "/" in path[:min_idx]:
+            min_idx = path[:min_idx].rindex("/")  # last "/" before the first magic char
+            root = path[: min_idx + 1]
+            depth = path[min_idx + 1 :].count("/") + 1
+        else:
+            root = ""
+            depth = path[min_idx + 1 :].count("/") + 1
+
+        if "**" in path:
+            if maxdepth is not None:
+                idx_double_stars = path.find("**")
+                depth_double_stars = path[idx_double_stars:].count("/") + 1
+                depth = depth - depth_double_stars + maxdepth
+            else:
+                depth = None  # "**" without maxdepth: no depth limit is passed to _find
+
+        allpaths = await self._find(
+            root, maxdepth=depth, withdirs=True, detail=True, **kwargs
+        )
+
+        pattern = glob_translate(path + ("/" if ends_with_slash else ""))
+        pattern = re.compile(pattern)
+
+        out = {
+            (
+                p.rstrip("/")
+                if not append_slash_to_dirname
+                and info["type"] == "directory"
+                and p.endswith("/")
+                else p
+            ): info
+            for p, info in sorted(allpaths.items())
+            if pattern.match(p.rstrip("/"))
+        }
+
+        if detail:
+            return out
+        else:
+            return list(out)
+
+    async def _isdir(self, path):
+        # override, since all URLs are (also) files
+        try:
+            return bool(await self._ls(path))
+        except (FileNotFoundError, ValueError):
+            return False
+
+
+class HTTPFile(AbstractBufferedFile):
+    """
+    A file-like object pointing to a remote HTTP(S) resource
+
+    Supports only reading, with read-ahead of a predetermined block-size.
+
+    In the case that the server does not supply the filesize, only reading of
+    the complete file in one go is supported.
+
+    Parameters
+    ----------
+    url: str
+        Full URL of the remote resource, including the protocol
+    session: aiohttp.ClientSession or None
+        All calls will be made within this session, to avoid restarting
+        connections where the server allows this
+    block_size: int or None
+        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
+        configured for the FileSystem creating this file
+    size: None or int
+        If given, this is the size of the file in bytes, and we don't attempt
+        to call the server to find the value.
+    kwargs: all other key-values are passed to requests calls.
+    """
+
+    def __init__(
+        self,
+        fs,
+        url,
+        session=None,
+        block_size=None,
+        mode="rb",
+        cache_type="bytes",
+        cache_options=None,
+        size=None,
+        loop=None,
+        asynchronous=False,
+        **kwargs,
+    ):
+        if mode != "rb":
+            raise NotImplementedError("File mode not supported")
+        self.asynchronous = asynchronous
+        self.url = url
+        self.session = session
+        self.details = {"name": url, "size": size, "type": "file"}
+        super().__init__(
+            fs=fs,
+            path=url,
+            mode=mode,
+            block_size=block_size,
+            cache_type=cache_type,
+            cache_options=cache_options,
+            **kwargs,
+        )
+        self.loop = loop
+
+    def read(self, length=-1):
+        """Read bytes from file
+
+        Parameters
+        ----------
+        length: int
+            Read up to this many bytes. If negative, read all content to end of
+            file. If the server has not supplied the filesize, attempting to
+            read only part of the data will raise a ValueError.
+        """
+        if (
+            (length < 0 and self.loc == 0)  # explicit read all
+            # but not when the size is known and fits into a block anyways
+            and not (self.size is not None and self.size <= self.blocksize)
+        ):
+            self._fetch_all()
+        if self.size is None:
+            if length < 0:
+                self._fetch_all()
+        else:
+            length = min(self.size - self.loc, length)
+        return super().read(length)
+
+    async def async_fetch_all(self):
+        """Read whole file in one shot, without caching
+
+        This is only called when position is still at zero,
+        and read() is called without a byte-count.
+        """
+        logger.debug(f"Fetch all for {self}")
+        if not isinstance(self.cache, AllBytes):
+            r = await self.session.get(self.fs.encode_url(self.url), **self.kwargs)
+            async with r:
+                r.raise_for_status()
+                out = await r.read()
+                self.cache = AllBytes(
+                    size=len(out), fetcher=None, blocksize=None, data=out
+                )
+                self.size = len(out)  # size is now known from the downloaded body
+
+    _fetch_all = sync_wrapper(async_fetch_all)
+
+    def _parse_content_range(self, headers):
+        """Parse the Content-Range header"""
+        s = headers.get("Content-Range", "")
+        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
+        if not m:
+            return None, None, None
+
+        if m[1] == "*":
+            start = end = None
+        else:
+            start, end = [int(x) for x in m[1].split("-")]
+        total = None if m[2] == "*" else int(m[2])
+        return start, end, total
+
+    async def async_fetch_range(self, start, end):
+        """Download a block of data
+
+        The expectation is that the server returns only the requested bytes,
+        with HTTP code 206. If this is not the case, we first check the headers,
+        and then stream the output - if the data size is bigger than we
+        requested, an exception is raised.
+        """
+        logger.debug(f"Fetch range for {self}: {start}-{end}")
+        kwargs = self.kwargs.copy()
+        headers = kwargs.pop("headers", {}).copy()
+        headers["Range"] = f"bytes={start}-{end - 1}"  # `end` is exclusive here
+        logger.debug(f"{self.url} : {headers['Range']}")
+        r = await self.session.get(
+            self.fs.encode_url(self.url), headers=headers, **kwargs
+        )
+        async with r:
+            if r.status == 416:
+                # range request outside file
+                return b""
+            r.raise_for_status()
+
+            # If the server has handled the range request, it should reply
+            # with status 206 (partial content). But we'll guess that a suitable
+            # Content-Range header or a Content-Length no more than the
+            # requested range also mean we have got the desired range.
+            response_is_range = (
+                r.status == 206
+                or self._parse_content_range(r.headers)[0] == start
+                or int(r.headers.get("Content-Length", end + 1)) <= end - start
+            )
+
+            if response_is_range:
+                # partial content, as expected
+                out = await r.read()
+            elif start > 0:
+                raise ValueError(
+                    "The HTTP server doesn't appear to support range requests. "
+                    "Only reading this file from the beginning is supported. "
+                    "Open with block_size=0 for a streaming file interface."
+                )
+            else:
+                # Response is not a range, but we want the start of the file,
+                # so we can read the required amount anyway.
+                cl = 0
+                out = []
+                while True:
+                    chunk = await r.content.read(2**20)
+                    # data size unknown, let's read until we have enough
+                    if chunk:
+                        out.append(chunk)
+                        cl += len(chunk)
+                        if cl > end - start:
+                            break
+                    else:
+                        break
+                out = b"".join(out)[: end - start]  # trim any overshoot from the last chunk
+            return out
+
+    _fetch_range = sync_wrapper(async_fetch_range)
+
+    def __reduce__(self):
+        return (
+            reopen,
+            (
+                self.fs,
+                self.url,
+                self.mode,
+                self.blocksize,
+                self.cache.name if self.cache else "none",
+                self.size,
+            ),
+        )
+
+
+def reopen(fs, url, mode, blocksize, cache_type, size=None):
+    return fs.open(
+        url, mode=mode, block_size=blocksize, cache_type=cache_type, size=size
+    )
+
+
+magic_check = re.compile("([*[])")
+
+
+def has_magic(s):
+    match = magic_check.search(s)
+    return match is not None
+
+
+class HTTPStreamFile(AbstractBufferedFile):
+    def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs):
+        self.asynchronous = kwargs.pop("asynchronous", False)
+        self.url = url
+        self.loop = loop
+        self.session = session
+        if mode != "rb":
+            raise ValueError
+        self.details = {"name": url, "size": None}
+        super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs)
+
+        async def cor():
+            r = await self.session.get(self.fs.encode_url(url), **kwargs).__aenter__()
+            self.fs._raise_not_found_for_status(r, url)
+            return r
+
+        self.r = sync(self.loop, cor)
+
+    def seek(self, loc, whence=0):
+        if loc == 0 and whence == 1:
+            return
+        if loc == self.loc and whence == 0:
+            return
+        raise ValueError("Cannot seek streaming HTTP file")
+
+    async def _read(self, num=-1):
+        out = await self.r.content.read(num)
+        self.loc += len(out)
+        return out
+
+    read = sync_wrapper(_read)
+
+    async def _close(self):
+        self.r.close()
+
+    def close(self):
+        asyncio.run_coroutine_threadsafe(self._close(), self.loop)
+        super().close()
+
+    def __reduce__(self):
+        return reopen, (self.fs, self.url, self.mode, self.blocksize, self.cache.name)
+
+
+class AsyncStreamFile(AbstractAsyncStreamedFile):
+    def __init__(
+        self, fs, url, mode="rb", loop=None, session=None, size=None, **kwargs
+    ):
+        self.url = url
+        self.session = session
+        self.r = None
+        if mode != "rb":
+            raise ValueError
+        self.details = {"name": url, "size": None}
+        self.kwargs = kwargs
+        super().__init__(fs=fs, path=url, mode=mode, cache_type="none")
+        self.size = size
+
+    async def read(self, num=-1):
+        if self.r is None:
+            r = await self.session.get(
+                self.fs.encode_url(self.url), **self.kwargs
+            ).__aenter__()
+            self.fs._raise_not_found_for_status(r, self.url)
+            self.r = r
+        out = await self.r.content.read(num)
+        self.loc += len(out)
+        return out
+
+    async def close(self):
+        if self.r is not None:
+            self.r.close()
+            self.r = None
+        await super().close()
+
+
+async def get_range(session, url, start, end, file=None, **kwargs):
+    # explicit get a range when we know it must be safe
+    kwargs = kwargs.copy()
+    headers = kwargs.pop("headers", {}).copy()
+    headers["Range"] = f"bytes={start}-{end - 1}"  # `end` is exclusive; header end is inclusive
+    r = await session.get(url, headers=headers, **kwargs)
+    async with r:
+        r.raise_for_status()  # checked inside the context so an error still releases r
+        out = await r.read()
+    if file:
+        with open(file, "r+b") as f:  # noqa: ASYNC101
+            f.seek(start)
+            f.write(out)
+    else:
+        return out
+
+
+async def _file_info(url, session, size_policy="head", **kwargs):
+    """Call HEAD on the server to get details about the file (size/checksum etc.)
+
+    Default operation is to explicitly allow redirects and use encoding
+    'identity' (no compression) to get the true size of the target.
+    """
+    logger.debug("Retrieve file size for %s", url)
+    kwargs = kwargs.copy()
+    ar = kwargs.pop("allow_redirects", True)
+    head = kwargs.get("headers", {}).copy()
+    head["Accept-Encoding"] = "identity"
+    kwargs["headers"] = head
+
+    info = {}
+    if size_policy == "head":
+        r = await session.head(url, allow_redirects=ar, **kwargs)
+    elif size_policy == "get":
+        r = await session.get(url, allow_redirects=ar, **kwargs)
+    else:
+        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
+    async with r:
+        r.raise_for_status()
+
+        # TODO:
+        #  recognise lack of 'Accept-Ranges',
+        #                 or 'Accept-Ranges': 'none' (not 'bytes')
+        #  to mean streaming only, no random access => return None
+        if "Content-Length" in r.headers:
+            # Some servers may choose to ignore Accept-Encoding and return
+            # compressed content, in which case the returned size is unreliable.
+            if "Content-Encoding" not in r.headers or r.headers["Content-Encoding"] in [
+                "identity",
+                "",
+            ]:
+                info["size"] = int(r.headers["Content-Length"])
+        elif "Content-Range" in r.headers:
+            info["size"] = int(r.headers["Content-Range"].split("/")[1])
+
+        if "Content-Type" in r.headers:
+            info["mimetype"] = r.headers["Content-Type"].partition(";")[0]
+
+        info["url"] = str(r.url)
+
+        for checksum_field in ["ETag", "Content-MD5", "Digest"]:
+            if r.headers.get(checksum_field):
+                info[checksum_field] = r.headers[checksum_field]
+
+    return info
+
+
+async def _file_size(url, session=None, *args, **kwargs):
+    if session is None:
+        session = await get_client()  # no session supplied: create one for this call
+    info = await _file_info(url, session, *args, **kwargs)  # positional, so *args cannot collide
+    return info.get("size")  # None when _file_info recorded no size
+
+
+file_size = sync_wrapper(_file_size)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2839f4c1feea56dddd54bdc00f0b884c8461d29e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/jupyter.py
@@ -0,0 +1,124 @@
+import base64
+import io
+import re
+
+import requests
+
+import fsspec
+
+
+class JupyterFileSystem(fsspec.AbstractFileSystem):
+    """View of the files as seen by a Jupyter server (notebook or lab)"""
+
+    protocol = ("jupyter", "jlab")
+
+    def __init__(self, url, tok=None, **kwargs):
+        """
+
+        Parameters
+        ----------
+        url : str
+            Base URL of the server, like "http://127.0.0.1:8888". May include
+            token in the string, which is given by the process when starting up
+        tok : str
+            If the token is obtained separately, can be given here
+        kwargs
+        """
+        if "?" in url:
+            if tok is None:
+                try:
+                    tok = re.findall("token=([a-z0-9]+)", url)[0]
+                except IndexError as e:
+                    raise ValueError("Could not determine token") from e
+            url = url.split("?", 1)[0]  # drop the query string once the token is known
+        self.url = url.rstrip("/") + "/api/contents"
+        self.session = requests.Session()
+        if tok:
+            self.session.headers["Authorization"] = f"token {tok}"
+
+        super().__init__(**kwargs)
+
+    def ls(self, path, detail=True, **kwargs):
+        path = self._strip_protocol(path)
+        r = self.session.get(f"{self.url}/{path}")
+        if r.status_code == 404:
+            raise FileNotFoundError(path)  # raise, not return: a 404 must not look like a listing
+        r.raise_for_status()
+        out = r.json()
+
+        if out["type"] == "directory":
+            out = out["content"]
+        else:
+            out = [out]  # a single file is listed as a one-element directory
+        for o in out:
+            o["name"] = o.pop("path")
+            o.pop("content")
+            if o["type"] == "notebook":
+                o["type"] = "file"
+        if detail:
+            return out
+        return [o["name"] for o in out]
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        path = self._strip_protocol(path)
+        r = self.session.get(f"{self.url}/{path}")
+        if r.status_code == 404:
+            raise FileNotFoundError(path)  # raise, not return: callers expect bytes or an error
+        r.raise_for_status()
+        out = r.json()
+        if out["format"] == "text":
+            # data should be binary
+            b = out["content"].encode()
+        else:
+            b = base64.b64decode(out["content"])
+        return b[start:end]
+
+    def pipe_file(self, path, value, **_):
+        path = self._strip_protocol(path)
+        json = {
+            "name": path.rsplit("/", 1)[-1],
+            "path": path,
+            "size": len(value),
+            "content": base64.b64encode(value).decode(),
+            "format": "base64",
+            "type": "file",
+        }
+        self.session.put(f"{self.url}/{path}", json=json)
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        path = self._strip_protocol(path)
+        if create_parents and "/" in path:
+            self.mkdir(path.rsplit("/", 1)[0], True)  # create ancestors first, outermost last
+        json = {
+            "name": path.rsplit("/", 1)[-1],
+            "path": path,
+            "size": None,
+            "content": None,
+            "type": "directory",
+        }
+        self.session.put(f"{self.url}/{path}", json=json)
+
+    def _rm(self, path):
+        path = self._strip_protocol(path)
+        self.session.delete(f"{self.url}/{path}")
+
+    def _open(self, path, mode="rb", **kwargs):
+        path = self._strip_protocol(path)
+        if mode == "rb":
+            data = self.cat_file(path)  # whole file is fetched up front
+            return io.BytesIO(data)
+        else:
+            return SimpleFileWriter(self, path, mode="wb")
+
+
+class SimpleFileWriter(fsspec.spec.AbstractBufferedFile):
+    def _upload_chunk(self, final=False):
+        """Never uploads a chunk until file is done
+
+        Not suitable for large files
+        """
+        if final is False:
+            return False
+        self.buffer.seek(0)
+        data = self.buffer.read()
+        self.fs.pipe_file(self.path, data)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb6f145352e1989e0477e259be02d8d7f4d729e2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/libarchive.py
@@ -0,0 +1,213 @@
+from contextlib import contextmanager
+from ctypes import (
+    CFUNCTYPE,
+    POINTER,
+    c_int,
+    c_longlong,
+    c_void_p,
+    cast,
+    create_string_buffer,
+)
+
+import libarchive
+import libarchive.ffi as ffi
+
+from fsspec import open_files
+from fsspec.archive import AbstractArchiveFileSystem
+from fsspec.implementations.memory import MemoryFile
+from fsspec.utils import DEFAULT_BLOCK_SIZE
+
+# Libarchive requires seekable files or memory only for certain archive
+# types. However, since we read the directory first to cache the contents
+# and also allow random access to any file, the file-like object needs
+# to be seekable no matter what.
+
+# Seek call-backs (not provided in the libarchive python wrapper)
+# ctypes prototype returning c_longlong and taking (c_int, c_void_p, c_longlong,
+# c_int); ``seek_func`` inside ``custom_reader`` is wrapped with it.
+SEEK_CALLBACK = CFUNCTYPE(c_longlong, c_int, c_void_p, c_longlong, c_int)
+# Binding built through ``ffi.ffi`` for "read_set_seek_callback"; ``custom_reader``
+# calls it to register the seek callback on the archive handle.
+read_set_seek_callback = ffi.ffi(
+    "read_set_seek_callback", [ffi.c_archive_p, SEEK_CALLBACK], c_int, ffi.check_int
+)
+# True when ``libarchive.ffi`` exposes ``NO_OPEN_CB``; ``custom_reader`` then uses
+# ``NO_OPEN_CB``/``NO_CLOSE_CB`` instead of constructing the callbacks itself.
+new_api = hasattr(ffi, "NO_OPEN_CB")
+
+
+@contextmanager
+def custom_reader(file, format_name="all", filter_name="all", block_size=ffi.page_size):
+    """Read an archive from a seekable file-like object.
+
+    The `file` object must support the standard `readinto` and 'seek' methods.
+
+    Parameters
+    ----------
+    file: file-like
+        Source of the archive bytes; ``readinto``, ``seek`` and ``tell`` are
+        called on it from the callbacks defined below.
+    format_name, filter_name: str
+        Passed to ``libarchive.read.new_archive_read``.
+    block_size: int
+        Size of the intermediate buffer filled by each read callback.
+
+    Yields
+    ------
+    ``libarchive.read.ArchiveRead`` wrapping the opened archive handle.
+    """
+    # one buffer is allocated up front and reused for every read callback
+    buf = create_string_buffer(block_size)
+    buf_p = cast(buf, c_void_p)
+
+    def read_func(archive_p, context, ptrptr):
+        # readinto the buffer, returns number of bytes read
+        length = file.readinto(buf)
+        # write the address of the buffer into the pointer
+        ptrptr = cast(ptrptr, POINTER(c_void_p))
+        ptrptr[0] = buf_p
+        # tell libarchive how much data was written into the buffer
+        return length
+
+    def seek_func(archive_p, context, offset, whence):
+        file.seek(offset, whence)
+        # tell libarchive the current position
+        return file.tell()
+
+    # NOTE(review): the wrapped callbacks are bound to locals, presumably so they
+    # stay referenced for as long as the archive is open - confirm.
+    read_cb = ffi.READ_CALLBACK(read_func)
+    seek_cb = SEEK_CALLBACK(seek_func)
+
+    # pick the open/close callbacks matching the installed wrapper version
+    if new_api:
+        open_cb = ffi.NO_OPEN_CB
+        close_cb = ffi.NO_CLOSE_CB
+    else:
+        open_cb = libarchive.read.OPEN_CALLBACK(ffi.VOID_CB)
+        close_cb = libarchive.read.CLOSE_CALLBACK(ffi.VOID_CB)
+
+    with libarchive.read.new_archive_read(format_name, filter_name) as archive_p:
+        # the seek callback is registered before the archive is opened
+        read_set_seek_callback(archive_p, seek_cb)
+        ffi.read_open(archive_p, None, open_cb, read_cb, close_cb)
+        yield libarchive.read.ArchiveRead(archive_p)
+
+
+class LibArchiveFileSystem(AbstractArchiveFileSystem):
+    """Compressed archives as a file-system (read-only)
+
+    Supports the following formats:
+    tar, pax , cpio, ISO9660, zip, mtree, shar, ar, raw, xar, lha/lzh, rar
+    Microsoft CAB, 7-Zip, WARC
+
+    See the libarchive documentation for further restrictions.
+    https://www.libarchive.org/
+
+    Keeps file object open while instance lives. It only works in seekable
+    file-like objects. In case the filesystem does not support this kind of
+    file object, it is recommended to cache locally.
+
+    This class is pickleable, but not necessarily thread-safe (depends on the
+    platform). See libarchive documentation for details.
+    """
+
+    root_marker = ""
+    protocol = "libarchive"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        mode="r",
+        target_protocol=None,
+        target_options=None,
+        block_size=DEFAULT_BLOCK_SIZE,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        fo: str or file-like
+            Contains ZIP, and must exist. If a str, will fetch file using
+            :meth:`~fsspec.open_files`, which must return one file exactly.
+        mode: str
+            Currently, only 'r' accepted
+        target_protocol: str (optional)
+            If ``fo`` is a string, this value can be used to override the
+            FS protocol inferred from a URL
+        target_options: dict (optional)
+            Kwargs passed when instantiating the target FS, if ``fo`` is
+            a string.
+        """
+        super().__init__(self, **kwargs)
+        if mode != "r":
+            raise ValueError("Only read from archive files accepted")
+        if isinstance(fo, str):
+            files = open_files(fo, protocol=target_protocol, **(target_options or {}))
+            if len(files) != 1:
+                raise ValueError(
+                    f'Path "{fo}" did not resolve to exactly one file: "{files}"'
+                )
+            fo = files[0]
+        self.of = fo
+        self.fo = fo.__enter__()  # the whole instance is a context
+        self.block_size = block_size
+        self.dir_cache = None
+
+    @contextmanager
+    def _open_archive(self):
+        self.fo.seek(0)
+        with custom_reader(self.fo, block_size=self.block_size) as arc:
+            yield arc
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        # file paths are always relative to the archive root
+        return super()._strip_protocol(path).lstrip("/")
+
+    def _get_dirs(self):
+        fields = {
+            "name": "pathname",
+            "size": "size",
+            "created": "ctime",
+            "mode": "mode",
+            "uid": "uid",
+            "gid": "gid",
+            "mtime": "mtime",
+        }
+
+        if self.dir_cache is not None:
+            return
+
+        self.dir_cache = {}
+        list_names = []
+        with self._open_archive() as arc:
+            for entry in arc:
+                if not entry.isdir and not entry.isfile:
+                    # Skip symbolic links, fifo entries, etc.
+                    continue
+                self.dir_cache.update(
+                    {
+                        dirname: {"name": dirname, "size": 0, "type": "directory"}
+                        for dirname in self._all_dirnames(set(entry.name))
+                    }
+                )
+                f = {key: getattr(entry, fields[key]) for key in fields}
+                f["type"] = "directory" if entry.isdir else "file"
+                list_names.append(entry.name)
+
+                self.dir_cache[f["name"]] = f
+        # libarchive does not seem to return an entry for the directories (at least
+        # not in all formats), so get the directories names from the files names
+        self.dir_cache.update(
+            {
+                dirname: {"name": dirname, "size": 0, "type": "directory"}
+                for dirname in self._all_dirnames(list_names)
+            }
+        )
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        path = self._strip_protocol(path)
+        if mode != "rb":
+            raise NotImplementedError
+
+        data = bytes()
+        with self._open_archive() as arc:
+            for entry in arc:
+                if entry.pathname != path:
+                    continue
+
+                if entry.size == 0:
+                    # empty file, so there are no blocks
+                    break
+
+                for block in entry.get_blocks(entry.size):
+                    data = block
+                    break
+                else:
+                    raise ValueError
+        return MemoryFile(fs=self, path=path, data=data)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py
new file mode 100644
index 0000000000000000000000000000000000000000..9881606f138f59ba88a4c2882ef7f7b5de06c122
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/local.py
@@ -0,0 +1,467 @@
+import datetime
+import io
+import logging
+import os
+import os.path as osp
+import shutil
+import stat
+import tempfile
+
+from fsspec import AbstractFileSystem
+from fsspec.compression import compr
+from fsspec.core import get_compression
+from fsspec.utils import isfilelike, stringify_path
+
+logger = logging.getLogger("fsspec.local")
+
+
+class LocalFileSystem(AbstractFileSystem):
+    """Interface to files on local storage
+
+    Parameters
+    ----------
+    auto_mkdir: bool
+        Whether, when opening a file, the directory containing it should
+        be created (if it doesn't already exist). This is assumed by pyarrow
+        code.
+    """
+
+    root_marker = "/"
+    protocol = "file", "local"
+    local_file = True
+
+    def __init__(self, auto_mkdir=False, **kwargs):
+        super().__init__(**kwargs)
+        self.auto_mkdir = auto_mkdir
+
+    @property
+    def fsid(self):
+        return "local"
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        path = self._strip_protocol(path)
+        if self.exists(path):
+            raise FileExistsError(path)
+        if create_parents:
+            self.makedirs(path, exist_ok=True)
+        else:
+            os.mkdir(path, **kwargs)
+
+    def makedirs(self, path, exist_ok=False):
+        path = self._strip_protocol(path)
+        os.makedirs(path, exist_ok=exist_ok)
+
+    def rmdir(self, path):
+        path = self._strip_protocol(path)
+        os.rmdir(path)
+
+    def ls(self, path, detail=False, **kwargs):
+        path = self._strip_protocol(path)
+        info = self.info(path)
+        if info["type"] == "directory":
+            with os.scandir(path) as it:
+                infos = [self.info(f) for f in it]
+        else:
+            infos = [info]
+
+        if not detail:
+            return [i["name"] for i in infos]
+        return infos
+
+    def info(self, path, **kwargs):
+        if isinstance(path, os.DirEntry):
+            # scandir DirEntry
+            out = path.stat(follow_symlinks=False)
+            link = path.is_symlink()
+            if path.is_dir(follow_symlinks=False):
+                t = "directory"
+            elif path.is_file(follow_symlinks=False):
+                t = "file"
+            else:
+                t = "other"
+            path = self._strip_protocol(path.path)
+        else:
+            # str or path-like
+            path = self._strip_protocol(path)
+            out = os.stat(path, follow_symlinks=False)
+            link = stat.S_ISLNK(out.st_mode)
+            if link:
+                out = os.stat(path, follow_symlinks=True)
+            if stat.S_ISDIR(out.st_mode):
+                t = "directory"
+            elif stat.S_ISREG(out.st_mode):
+                t = "file"
+            else:
+                t = "other"
+        result = {
+            "name": path,
+            "size": out.st_size,
+            "type": t,
+            "created": out.st_ctime,
+            "islink": link,
+        }
+        for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
+            result[field] = getattr(out, f"st_{field}")
+        if result["islink"]:
+            result["destination"] = os.readlink(path)
+            try:
+                out2 = os.stat(path, follow_symlinks=True)
+                result["size"] = out2.st_size
+            except OSError:
+                result["size"] = 0
+        return result
+
+    def lexists(self, path, **kwargs):
+        return osp.lexists(path)
+
+    def cp_file(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        if self.auto_mkdir:
+            self.makedirs(self._parent(path2), exist_ok=True)
+        if self.isfile(path1):
+            shutil.copyfile(path1, path2)
+        elif self.isdir(path1):
+            self.mkdirs(path2, exist_ok=True)
+        else:
+            raise FileNotFoundError(path1)
+
+    def isfile(self, path):
+        path = self._strip_protocol(path)
+        return os.path.isfile(path)
+
+    def isdir(self, path):
+        path = self._strip_protocol(path)
+        return os.path.isdir(path)
+
+    def get_file(self, path1, path2, callback=None, **kwargs):
+        if isfilelike(path2):
+            with open(path1, "rb") as f:
+                shutil.copyfileobj(f, path2)
+        else:
+            return self.cp_file(path1, path2, **kwargs)
+
+    def put_file(self, path1, path2, callback=None, **kwargs):
+        return self.cp_file(path1, path2, **kwargs)
+
+    def mv(self, path1, path2, **kwargs):
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        shutil.move(path1, path2)
+
+    def link(self, src, dst, **kwargs):
+        src = self._strip_protocol(src)
+        dst = self._strip_protocol(dst)
+        os.link(src, dst, **kwargs)
+
+    def symlink(self, src, dst, **kwargs):
+        src = self._strip_protocol(src)
+        dst = self._strip_protocol(dst)
+        os.symlink(src, dst, **kwargs)
+
+    def islink(self, path) -> bool:
+        return os.path.islink(self._strip_protocol(path))
+
+    def rm_file(self, path):
+        os.remove(self._strip_protocol(path))
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        if not isinstance(path, list):
+            path = [path]
+
+        for p in path:
+            p = self._strip_protocol(p)
+            if self.isdir(p):
+                if not recursive:
+                    raise ValueError("Cannot delete directory, set recursive=True")
+                if osp.abspath(p) == os.getcwd():
+                    raise ValueError("Cannot delete current working directory")
+                shutil.rmtree(p)
+            else:
+                os.remove(p)
+
+    def unstrip_protocol(self, name):
+        name = self._strip_protocol(name)  # normalise for local/win/...
+        return f"file://{name}"
+
+    def _open(self, path, mode="rb", block_size=None, **kwargs):
+        path = self._strip_protocol(path)
+        if self.auto_mkdir and "w" in mode:
+            self.makedirs(self._parent(path), exist_ok=True)
+        return LocalFileOpener(path, mode, fs=self, **kwargs)
+
+    def touch(self, path, truncate=True, **kwargs):
+        path = self._strip_protocol(path)
+        if self.auto_mkdir:
+            self.makedirs(self._parent(path), exist_ok=True)
+        if self.exists(path):
+            os.utime(path, None)
+        else:
+            open(path, "a").close()
+        if truncate:
+            os.truncate(path, 0)
+
+    def created(self, path):
+        info = self.info(path=path)
+        return datetime.datetime.fromtimestamp(
+            info["created"], tz=datetime.timezone.utc
+        )
+
+    def modified(self, path):
+        info = self.info(path=path)
+        return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
+
+    @classmethod
+    def _parent(cls, path):
+        path = cls._strip_protocol(path)
+        if os.sep == "/":
+            # posix native
+            return path.rsplit("/", 1)[0] or "/"
+        else:
+            # NT
+            path_ = path.rsplit("/", 1)[0]
+            if len(path_) <= 3:
+                if path_[1:2] == ":":
+                    # nt root (something like c:/)
+                    return path_[0] + ":/"
+            # More cases may be required here
+            return path_
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        path = stringify_path(path)
+        if path.startswith("file://"):
+            path = path[7:]
+        elif path.startswith("file:"):
+            path = path[5:]
+        elif path.startswith("local://"):
+            path = path[8:]
+        elif path.startswith("local:"):
+            path = path[6:]
+
+        path = make_path_posix(path)
+        if os.sep != "/":
+            # This code-path is a stripped down version of
+            # > drive, path = ntpath.splitdrive(path)
+            if path[1:2] == ":":
+                # Absolute drive-letter path, e.g. X:\Windows
+                # Relative path with drive, e.g. X:Windows
+                drive, path = path[:2], path[2:]
+            elif path[:2] == "//":
+                # UNC drives, e.g. \\server\share or \\?\UNC\server\share
+                # Device drives, e.g. \\.\device or \\?\device
+                if (index1 := path.find("/", 2)) == -1 or (
+                    index2 := path.find("/", index1 + 1)
+                ) == -1:
+                    drive, path = path, ""
+                else:
+                    drive, path = path[:index2], path[index2:]
+            else:
+                # Relative path, e.g. Windows
+                drive = ""
+
+            path = path.rstrip("/") or cls.root_marker
+            return drive + path
+
+        else:
+            return path.rstrip("/") or cls.root_marker
+
+    def _isfilestore(self):
+        # Inheriting from DaskFileSystem makes this False (S3, etc. were)
+        # the original motivation. But we are a posix-like file system.
+        # See https://github.com/dask/dask/issues/5526
+        return True
+
+    def chmod(self, path, mode):
+        path = stringify_path(path)
+        return os.chmod(path, mode)
+
+
+def make_path_posix(path):
+    """Make path generic and absolute for current OS"""
+    if not isinstance(path, str):
+        if isinstance(path, (list, set, tuple)):
+            return type(path)(make_path_posix(p) for p in path)
+        else:
+            path = stringify_path(path)
+            if not isinstance(path, str):
+                raise TypeError(f"could not convert {path!r} to string")
+    if os.sep == "/":
+        # Native posix
+        if path.startswith("/"):
+            # most common fast case for posix
+            return path
+        elif path.startswith("~"):
+            return osp.expanduser(path)
+        elif path.startswith("./"):
+            path = path[2:]
+        elif path == ".":
+            path = ""
+        return f"{os.getcwd()}/{path}"
+    else:
+        # NT handling
+        if path[0:1] == "/" and path[2:3] == ":":
+            # path is like "/c:/local/path"
+            path = path[1:]
+        if path[1:2] == ":":
+            # windows full path like "C:\\local\\path"
+            if len(path) <= 3:
+                # nt root (something like c:/)
+                return path[0] + ":/"
+            path = path.replace("\\", "/")
+            return path
+        elif path[0:1] == "~":
+            return make_path_posix(osp.expanduser(path))
+        elif path.startswith(("\\\\", "//")):
+            # windows UNC/DFS-style paths
+            return "//" + path[2:].replace("\\", "/")
+        elif path.startswith(("\\", "/")):
+            # windows relative path with root
+            path = path.replace("\\", "/")
+            return f"{osp.splitdrive(os.getcwd())[0]}{path}"
+        else:
+            path = path.replace("\\", "/")
+            if path.startswith("./"):
+                path = path[2:]
+            elif path == ".":
+                path = ""
+            return f"{make_path_posix(os.getcwd())}/{path}"
+
+
+def trailing_sep(path):
+    """Return True if the path ends with a path separator.
+
+    A forward slash is always considered a path separator, even on Operating
+    Systems that normally use a backslash.
+    """
+    # TODO: if all incoming paths were posix-compliant then separator would
+    # always be a forward slash, simplifying this function.
+    # See https://github.com/fsspec/filesystem_spec/pull/1250
+    return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
+
+
+class LocalFileOpener(io.IOBase):
+    def __init__(
+        self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
+    ):
+        logger.debug("open file: %s", path)
+        self.path = path
+        self.mode = mode
+        self.fs = fs
+        self.f = None
+        self.autocommit = autocommit
+        self.compression = get_compression(path, compression)
+        self.blocksize = io.DEFAULT_BUFFER_SIZE
+        self._open()
+
+    def _open(self):
+        if self.f is None or self.f.closed:
+            if self.autocommit or "w" not in self.mode:
+                self.f = open(self.path, mode=self.mode)
+                if self.compression:
+                    compress = compr[self.compression]
+                    self.f = compress(self.f, mode=self.mode)
+            else:
+                # TODO: check if path is writable?
+                i, name = tempfile.mkstemp()
+                os.close(i)  # we want normal open and normal buffered file
+                self.temp = name
+                self.f = open(name, mode=self.mode)
+            if "w" not in self.mode:
+                self.size = self.f.seek(0, 2)
+                self.f.seek(0)
+                self.f.size = self.size
+
+    def _fetch_range(self, start, end):
+        # probably only used by cached FS
+        if "r" not in self.mode:
+            raise ValueError
+        self._open()
+        self.f.seek(start)
+        return self.f.read(end - start)
+
+    def __setstate__(self, state):
+        self.f = None
+        loc = state.pop("loc", None)
+        self.__dict__.update(state)
+        if "r" in state["mode"]:
+            self.f = None
+            self._open()
+            self.f.seek(loc)
+
+    def __getstate__(self):
+        d = self.__dict__.copy()
+        d.pop("f")
+        if "r" in self.mode:
+            d["loc"] = self.f.tell()
+        else:
+            if not self.f.closed:
+                raise ValueError("Cannot serialise open write-mode local file")
+        return d
+
+    def commit(self):
+        if self.autocommit:
+            raise RuntimeError("Can only commit if not already set to autocommit")
+        shutil.move(self.temp, self.path)
+
+    def discard(self):
+        if self.autocommit:
+            raise RuntimeError("Cannot discard if set to autocommit")
+        os.remove(self.temp)
+
+    def readable(self) -> bool:
+        return True
+
+    def writable(self) -> bool:
+        return "r" not in self.mode
+
+    def read(self, *args, **kwargs):
+        return self.f.read(*args, **kwargs)
+
+    def write(self, *args, **kwargs):
+        return self.f.write(*args, **kwargs)
+
+    def tell(self, *args, **kwargs):
+        return self.f.tell(*args, **kwargs)
+
+    def seek(self, *args, **kwargs):
+        return self.f.seek(*args, **kwargs)
+
+    def seekable(self, *args, **kwargs):
+        return self.f.seekable(*args, **kwargs)
+
+    def readline(self, *args, **kwargs):
+        return self.f.readline(*args, **kwargs)
+
+    def readlines(self, *args, **kwargs):
+        return self.f.readlines(*args, **kwargs)
+
+    def close(self):
+        return self.f.close()
+
+    def truncate(self, size=None) -> int:
+        return self.f.truncate(size)
+
+    @property
+    def closed(self):
+        return self.f.closed
+
+    def fileno(self):
+        return self.raw.fileno()
+
+    def flush(self) -> None:
+        self.f.flush()
+
+    def __iter__(self):
+        return self.f.__iter__()
+
+    def __getattr__(self, item):
+        return getattr(self.f, item)
+
+    def __enter__(self):
+        self._incontext = True
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._incontext = False
+        self.f.__exit__(exc_type, exc_value, traceback)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..83e7e74d6ceceaf6e75268923094bfdf56b72fc7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/memory.py
@@ -0,0 +1,303 @@
+from __future__ import annotations
+
+import logging
+from datetime import datetime, timezone
+from errno import ENOTEMPTY
+from io import BytesIO
+from pathlib import PurePath, PureWindowsPath
+from typing import Any, ClassVar
+
+from fsspec import AbstractFileSystem
+from fsspec.implementations.local import LocalFileSystem
+from fsspec.utils import stringify_path
+
+logger = logging.getLogger("fsspec.memoryfs")
+
+
+class MemoryFileSystem(AbstractFileSystem):
+    """A filesystem based on a dict of BytesIO objects
+
+    This is a global filesystem so instances of this class all point to the same
+    in memory filesystem.
+    """
+
+    # path -> file object; class attribute shared by every instance
+    store: ClassVar[dict[str, Any]] = {}  # global, do not overwrite!
+    # explicitly created directories; "" is the root
+    pseudo_dirs = [""]  # global, do not overwrite!
+    protocol = "memory"
+    root_marker = "/"
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """Normalise ``path`` to the key form used in ``store``.
+
+        A leading "memory://" is removed.  Paths still containing "::" or "://"
+        only lose trailing slashes.  Anything else is returned with exactly one
+        leading "/" and no trailing one; the root becomes "".
+        """
+        if isinstance(path, PurePath):
+            if isinstance(path, PureWindowsPath):
+                return LocalFileSystem._strip_protocol(path)
+            else:
+                path = stringify_path(path)
+
+        if path.startswith("memory://"):
+            path = path[len("memory://") :]
+        if "::" in path or "://" in path:
+            return path.rstrip("/")
+        path = path.lstrip("/").rstrip("/")
+        return "/" + path if path else ""
+
+    def ls(self, path, detail=True, **kwargs):
+        """List ``path``.
+
+        If ``path`` is itself a key in ``store`` a single file record is
+        returned.  Otherwise the result holds the files directly below
+        ``path``, directories implied by deeper keys, and matching entries of
+        ``pseudo_dirs``.  With ``detail=False`` only names are returned.
+
+        Raises
+        ------
+        FileNotFoundError
+            If nothing is found and ``path`` is not in ``pseudo_dirs``.
+        """
+        path = self._strip_protocol(path)
+        if path in self.store:
+            # there is a key with this exact name
+            if not detail:
+                return [path]
+            return [
+                {
+                    "name": path,
+                    "size": self.store[path].size,
+                    "type": "file",
+                    "created": self.store[path].created.timestamp(),
+                }
+            ]
+        # directory names already emitted, to avoid duplicates
+        paths = set()
+        starter = path + "/"
+        out = []
+        # iterate over a snapshot of the keys
+        for p2 in tuple(self.store):
+            if p2.startswith(starter):
+                if "/" not in p2[len(starter) :]:
+                    # exact child
+                    out.append(
+                        {
+                            "name": p2,
+                            "size": self.store[p2].size,
+                            "type": "file",
+                            "created": self.store[p2].created.timestamp(),
+                        }
+                    )
+                elif len(p2) > len(starter):
+                    # implied child directory
+                    ppath = starter + p2[len(starter) :].split("/", 1)[0]
+                    if ppath not in paths:
+                        out = out or []
+                        out.append(
+                            {
+                                "name": ppath,
+                                "size": 0,
+                                "type": "directory",
+                            }
+                        )
+                        paths.add(ppath)
+        for p2 in self.pseudo_dirs:
+            if p2.startswith(starter):
+                if "/" not in p2[len(starter) :]:
+                    # exact child pdir
+                    if p2 not in paths:
+                        out.append({"name": p2, "size": 0, "type": "directory"})
+                        paths.add(p2)
+                else:
+                    # directory implied by deeper pdir
+                    ppath = starter + p2[len(starter) :].split("/", 1)[0]
+                    if ppath not in paths:
+                        out.append({"name": ppath, "size": 0, "type": "directory"})
+                        paths.add(ppath)
+        if not out:
+            if path in self.pseudo_dirs:
+                # empty dir
+                return []
+            raise FileNotFoundError(path)
+        if detail:
+            return out
+        return sorted([f["name"] for f in out])
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        """Register ``path`` as a directory in ``pseudo_dirs``.
+
+        Raises ``FileExistsError`` if ``path`` is already a file or directory,
+        and ``NotADirectoryError`` if its parent is a file.  With
+        ``create_parents`` the ancestors are registered first.
+        """
+        path = self._strip_protocol(path)
+        if path in self.store or path in self.pseudo_dirs:
+            raise FileExistsError(path)
+        if self._parent(path).strip("/") and self.isfile(self._parent(path)):
+            raise NotADirectoryError(self._parent(path))
+        if create_parents and self._parent(path).strip("/"):
+            try:
+                self.mkdir(self._parent(path), create_parents, **kwargs)
+            except FileExistsError:
+                # parent already exists
+                pass
+        if path and path not in self.pseudo_dirs:
+            self.pseudo_dirs.append(path)
+
+    def makedirs(self, path, exist_ok=False):
+        """Create ``path`` with parents; an existing path raises unless ``exist_ok``."""
+        try:
+            self.mkdir(path, create_parents=True)
+        except FileExistsError:
+            if not exist_ok:
+                raise
+
+    def pipe_file(self, path, value, **kwargs):
+        """Set the bytes of given file
+
+        Avoids copies of the data if possible
+        """
+        self.open(path, "wb", data=value)
+
+    def rmdir(self, path):
+        """Remove the empty directory ``path`` from ``pseudo_dirs``.
+
+        Raises ``OSError`` (ENOTEMPTY) if it has content and
+        ``FileNotFoundError`` if it is not a registered directory.
+        """
+        path = self._strip_protocol(path)
+        if path == "":
+            # silently avoid deleting FS root
+            return
+        if path in self.pseudo_dirs:
+            if not self.ls(path):
+                self.pseudo_dirs.remove(path)
+            else:
+                raise OSError(ENOTEMPTY, "Directory not empty", path)
+        else:
+            raise FileNotFoundError(path)
+
+    def info(self, path, **kwargs):
+        """Return an info dict for ``path``.
+
+        A path counts as a directory when it is in ``pseudo_dirs`` or when any
+        stored key or pseudo directory starts with ``path + "/"``; the
+        directory check runs before the file lookup.
+
+        Raises ``FileNotFoundError`` when ``path`` is neither.
+        """
+        logger.debug("info: %s", path)
+        path = self._strip_protocol(path)
+        if path in self.pseudo_dirs or any(
+            p.startswith(path + "/") for p in list(self.store) + self.pseudo_dirs
+        ):
+            return {
+                "name": path,
+                "size": 0,
+                "type": "directory",
+            }
+        elif path in self.store:
+            filelike = self.store[path]
+            # NOTE(review): "created" is the raw attribute here, whereas ``ls``
+            # reports ``created.timestamp()`` - confirm which form callers expect.
+            return {
+                "name": path,
+                "size": filelike.size,
+                "type": "file",
+                "created": getattr(filelike, "created", None),
+            }
+        else:
+            raise FileNotFoundError(path)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """Return the file object for ``path``.
+
+        Modes "rb", "ab" and "r+b" return the object held in ``store``,
+        positioned at the end for "ab" and at the start otherwise.  Mode "wb"
+        creates a new ``MemoryFile`` from ``kwargs.get("data")`` and commits it
+        immediately unless a transaction is active.
+
+        Raises
+        ------
+        IsADirectoryError
+            If ``path`` is a registered directory.
+        FileExistsError
+            If an ancestor of ``path`` is a file.
+        FileNotFoundError
+            If ``path`` is not stored and the mode is not "wb".
+        ValueError
+            For any other mode.
+        """
+        path = self._strip_protocol(path)
+        if path in self.pseudo_dirs:
+            raise IsADirectoryError(path)
+        # walk up the ancestors; none of them may be a file
+        parent = path
+        while len(parent) > 1:
+            parent = self._parent(parent)
+            if self.isfile(parent):
+                raise FileExistsError(parent)
+        if mode in ["rb", "ab", "r+b"]:
+            if path in self.store:
+                f = self.store[path]
+                if mode == "ab":
+                    # position at the end of file
+                    f.seek(0, 2)
+                else:
+                    # position at the beginning of file
+                    f.seek(0)
+                return f
+            else:
+                raise FileNotFoundError(path)
+        elif mode == "wb":
+            m = MemoryFile(self, path, kwargs.get("data"))
+            if not self._intrans:
+                m.commit()
+            return m
+        else:
+            name = self.__class__.__name__
+            raise ValueError(f"unsupported file mode for {name}: {mode!r}")
+
+    def cp_file(self, path1, path2, **kwargs):
+        """Copy file ``path1`` to ``path2``; a directory source registers ``path2``.
+
+        Raises ``FileNotFoundError`` if ``path1`` is neither file nor directory.
+        """
+        path1 = self._strip_protocol(path1)
+        path2 = self._strip_protocol(path2)
+        if self.isfile(path1):
+            self.store[path2] = MemoryFile(
+                self, path2, self.store[path1].getvalue()
+            )  # implicit copy
+        elif self.isdir(path1):
+            if path2 not in self.pseudo_dirs:
+                self.pseudo_dirs.append(path2)
+        else:
+            raise FileNotFoundError(path1)
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        """Return the bytes ``[start:end]`` of the stored file ``path``.
+
+        Raises ``FileNotFoundError`` if ``path`` is not in ``store``.
+        """
+        logger.debug("cat: %s", path)
+        path = self._strip_protocol(path)
+        try:
+            return bytes(self.store[path].getbuffer()[start:end])
+        except KeyError:
+            raise FileNotFoundError(path)
+
+    def _rm(self, path):
+        """Delete ``path`` from ``store``; ``FileNotFoundError`` if absent."""
+        path = self._strip_protocol(path)
+        try:
+            del self.store[path]
+        except KeyError as e:
+            raise FileNotFoundError(path) from e
+
+    def modified(self, path):
+        """Return the ``modified`` attribute of the stored file ``path``."""
+        path = self._strip_protocol(path)
+        try:
+            return self.store[path].modified
+        except KeyError:
+            raise FileNotFoundError(path)
+
+    def created(self, path):
+        """Return the ``created`` attribute of the stored file ``path``."""
+        path = self._strip_protocol(path)
+        try:
+            return self.store[path].created
+        except KeyError:
+            raise FileNotFoundError(path)
+
+    def rm(self, path, recursive=False, maxdepth=None):
+        """Remove one path or several paths, files via ``rm_file`` and others via ``rmdir``.
+
+        The expanded paths are processed in reverse order.
+        """
+        if isinstance(path, str):
+            path = self._strip_protocol(path)
+        else:
+            path = [self._strip_protocol(p) for p in path]
+        paths = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
+        for p in reversed(paths):
+            # If the expanded path doesn't exist, it is only because the expanded
+            # path was a directory that does not exist in self.pseudo_dirs. This
+            # is possible if you directly create files without making the
+            # directories first.
+            if not self.exists(p):
+                continue
+            if self.isfile(p):
+                self.rm_file(p)
+            else:
+                self.rmdir(p)
+
+
+class MemoryFile(BytesIO):
+    """A BytesIO which can't close and works as a context manager
+
+    Can initialise with data. Each path should only be active once at any moment.
+
+    No need to provide fs, path if auto-committing (default)
+    """
+
+    def __init__(self, fs=None, path=None, data=None):
+        logger.debug("open file %s", path)
+        self.fs = fs
+        self.path = path
+        self.created = datetime.now(tz=timezone.utc)
+        self.modified = datetime.now(tz=timezone.utc)
+        if data:
+            super().__init__(data)
+            self.seek(0)
+
+    @property
+    def size(self):
+        return self.getbuffer().nbytes
+
+    def __enter__(self):
+        return self
+
+    def close(self):
+        pass
+
+    def discard(self):
+        pass
+
+    def commit(self):
+        self.fs.store[self.path] = self
+        self.modified = datetime.now(tz=timezone.utc)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py
new file mode 100644
index 0000000000000000000000000000000000000000..e202c96d6bcb000144710115156c1c4c78935bc7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/reference.py
@@ -0,0 +1,1160 @@
+import base64
+import collections
+import io
+import itertools
+import logging
+import math
+import os
+from functools import lru_cache
+from typing import TYPE_CHECKING
+
+import fsspec.core
+
+try:
+    import ujson as json
+except ImportError:
+    if not TYPE_CHECKING:
+        import json
+
+from ..asyn import AsyncFileSystem
+from ..callbacks import DEFAULT_CALLBACK
+from ..core import filesystem, open, split_protocol
+from ..utils import isfilelike, merge_offset_ranges, other_paths
+
+logger = logging.getLogger("fsspec.reference")
+
+
+class ReferenceNotReachable(RuntimeError):
+    def __init__(self, reference, target, *args):
+        super().__init__(*args)
+        self.reference = reference
+        self.target = target
+
+    def __str__(self):
+        return f'Reference "{self.reference}" failed to fetch target {self.target}'
+
+
+def _first(d):
+    return list(d.values())[0]
+
+
+def _prot_in_references(path, references):
+    ref = references.get(path)
+    if isinstance(ref, (list, tuple)):
+        return split_protocol(ref[0])[0] if ref[0] else ref[0]
+
+
+def _protocol_groups(paths, references):
+    """Group ``paths`` into ``{protocol: [path, ...]}`` by reference target."""
+    if isinstance(paths, str):
+        paths = [paths]
+    grouped = {}
+    for one in paths:
+        grouped.setdefault(_prot_in_references(one, references), []).append(one)
+    return grouped
+
+
+class RefsValuesView(collections.abc.ValuesView):
+    def __iter__(self):
+        for val in self._mapping.zmetadata.values():
+            yield json.dumps(val).encode()
+        yield from self._mapping._items.values()
+        for field in self._mapping.listdir():
+            chunk_sizes = self._mapping._get_chunk_sizes(field)
+            if len(chunk_sizes) == 0:
+                yield self._mapping[field + "/0"]
+                continue
+            yield from self._mapping._generate_all_records(field)
+
+
+class RefsItemsView(collections.abc.ItemsView):
+    def __iter__(self):
+        return zip(self._mapping.keys(), self._mapping.values())
+
+
+def ravel_multi_index(idx, sizes):
+    """Flatten N-d index ``idx`` into a row-major offset over grid ``sizes``."""
+    flat, stride = 0, 1
+    for pos, extent in zip(reversed(idx), reversed(sizes)):
+        flat += pos * stride
+        stride *= extent
+    return flat
+
+
+class LazyReferenceMapper(collections.abc.MutableMapping):
+    """This interface can be used to read/write references from Parquet stores.
+    It is not intended for other types of references.
+    It can be used with Kerchunk's MultiZarrToZarr method to combine
+    references into a parquet store.
+    Examples of this use-case can be found here:
+    https://fsspec.github.io/kerchunk/advanced.html?highlight=parquet#parquet-storage"""
+
+    # import is class level to prevent numpy dep requirement for fsspec
+    @property
+    def np(self):
+        import numpy as np
+
+        return np
+
+    @property
+    def pd(self):
+        import pandas as pd
+
+        return pd
+
+    def __init__(
+        self, root, fs=None, out_root=None, cache_size=128, categorical_threshold=10
+    ):
+        """Lazy, writable mapping over a parquet reference store.
+
+        This instance will be writable, storing changes in memory until full partitions
+        are accumulated or .flush() is called.
+
+        To create an empty lazy store, use .create()
+
+        Parameters
+        ----------
+        root : str
+            Root of parquet store
+        fs : fsspec.AbstractFileSystem
+            fsspec filesystem object, default is local filesystem.
+        out_root : str or None
+            Root that partitions and .zmetadata are written to; defaults to ``root``.
+        cache_size : int, default=128
+            Maximum number of loaded parquet partitions kept in the LRU cache.
+        categorical_threshold : int, default=10
+            Store urls as pandas.Categorical when, for a partition being written,
+            (non-null urls) / (unique urls) is strictly greater than this number.
+        """
+        self.root = root
+        self.chunk_sizes = {}
+        self.out_root = out_root or self.root
+        self.cat_thresh = categorical_threshold
+        self.cache_size = cache_size
+        self.dirs = None
+        self.url = self.root + "/{field}/refs.{record}.parq"
+        # TODO: derive fs from `root`
+        self.fs = fsspec.filesystem("file") if fs is None else fs
+
+    def __getattr__(self, item):
+        if item in ("_items", "record_size", "zmetadata"):
+            self.setup()
+            # avoid possible recursion if setup fails somehow
+            return self.__dict__[item]
+        raise AttributeError(item)
+
+    def setup(self):
+        self._items = {}
+        self._items[".zmetadata"] = self.fs.cat_file(
+            "/".join([self.root, ".zmetadata"])
+        )
+        met = json.loads(self._items[".zmetadata"])
+        self.record_size = met["record_size"]
+        self.zmetadata = met["metadata"]
+
+        # Define function to open and decompress refs
+        @lru_cache(maxsize=self.cache_size)
+        def open_refs(field, record):
+            """cached parquet file loader"""
+            path = self.url.format(field=field, record=record)
+            data = io.BytesIO(self.fs.cat_file(path))
+            df = self.pd.read_parquet(data, engine="fastparquet")
+            refs = {c: df[c].values for c in df.columns}
+            return refs
+
+        self.open_refs = open_refs
+
+    @staticmethod
+    def create(root, storage_options=None, fs=None, record_size=10000, **kwargs):
+        """Make empty parquet reference set
+
+        First deletes the contents of the given directory, if it exists.
+
+        Parameters
+        ----------
+        root: str
+            Directory to contain the output; will be created
+        storage_options: dict | None
+            For making the filesystem to use for writing if fs is None
+        fs: FileSystem | None
+            Filesystem for writing
+        record_size: int
+            Number of references per parquet file
+        kwargs: passed to __init__
+
+        Returns
+        -------
+        LazyReferenceMapper instance
+        """
+        met = {"metadata": {}, "record_size": record_size}
+        if fs is None:  # derive the filesystem (and stripped path) from the URL
+            fs, root = fsspec.core.url_to_fs(root, **(storage_options or {}))
+        if fs.exists(root):
+            fs.rm(root, recursive=True)  # start from a clean directory
+        fs.makedirs(root, exist_ok=True)
+        fs.pipe("/".join([root, ".zmetadata"]), json.dumps(met).encode())
+        return LazyReferenceMapper(root, fs, **kwargs)
+
+    def listdir(self, basename=True):
+        """List top-level directories"""
+        # cache me?
+        if self.dirs is None:
+            dirs = [p.split("/", 1)[0] for p in self.zmetadata]
+            self.dirs = {p for p in dirs if p and not p.startswith(".")}
+        listing = self.dirs
+        if basename:
+            listing = [os.path.basename(path) for path in listing]
+        return listing
+
+    def ls(self, path="", detail=True):
+        """Shortcut file listings"""
+        if not path:
+            dirnames = self.listdir()
+            others = set(
+                [".zmetadata"]
+                + [name for name in self.zmetadata if "/" not in name]
+                + [name for name in self._items if "/" not in name]
+            )
+            if detail is False:
+                others.update(dirnames)
+                return sorted(others)
+            dirinfo = [
+                {"name": name, "type": "directory", "size": 0} for name in dirnames
+            ]
+            fileinfo = [
+                {
+                    "name": name,
+                    "type": "file",
+                    "size": len(
+                        json.dumps(self.zmetadata[name])
+                        if name in self.zmetadata
+                        else self._items[name]
+                    ),
+                }
+                for name in others
+            ]
+            return sorted(dirinfo + fileinfo, key=lambda s: s["name"])
+        parts = path.split("/", 1)
+        if len(parts) > 1:
+            raise FileNotFoundError("Cannot list within directories right now")
+        field = parts[0]
+        others = set(
+            [name for name in self.zmetadata if name.startswith(f"{path}/")]
+            + [name for name in self._items if name.startswith(f"{path}/")]
+        )
+        fileinfo = [
+            {
+                "name": name,
+                "type": "file",
+                "size": len(
+                    json.dumps(self.zmetadata[name])
+                    if name in self.zmetadata
+                    else self._items[name]
+                ),
+            }
+            for name in others
+        ]
+        keys = self._keys_in_field(field)
+
+        if detail is False:
+            return list(others) + list(keys)
+        recs = self._generate_all_records(field)
+        recinfo = [
+            {"name": name, "type": "file", "size": rec[-1]}
+            for name, rec in zip(keys, recs)
+            if rec[0]  # filters out path==None, deleted/missing
+        ]
+        return fileinfo + recinfo
+
+    def _load_one_key(self, key):
+        """Get the reference for one key: bytes, [url] or [url, offset, size].
+
+        Raises KeyError when the key is unknown, deleted or its partition is unreadable.
+        """
+        if key in self._items:
+            return self._items[key]
+        elif key in self.zmetadata:
+            return json.dumps(self.zmetadata[key]).encode()
+        elif "/" not in key or self._is_meta(key):
+            raise KeyError(key)
+        field, _ = key.rsplit("/", 1)
+        record, ri, chunk_size = self._key_to_record(key)
+        maybe = self._items.get((field, record), {}).get(ri, False)
+        if maybe is None:
+            # explicitly deleted (pending write); report which key, like other paths
+            raise KeyError(key)
+        elif maybe:
+            return maybe
+        elif chunk_size == 0:
+            return b""
+
+        # Chunk keys can be loaded from row group and cached in LRU cache
+        try:
+            refs = self.open_refs(field, record)
+        except (ValueError, TypeError, FileNotFoundError) as exc:
+            raise KeyError(key) from exc
+        columns = ["path", "offset", "size", "raw"]
+        selection = [refs[c][ri] if c in refs else None for c in columns]
+        raw = selection[-1]
+        if raw is not None:
+            return raw
+        if selection[0] is None:
+            raise KeyError("This reference does not exist or has been deleted")
+        if selection[1:3] == [0, 0]:
+            # URL only
+            return selection[:1]
+        # URL, offset, size
+        return selection[:3]
+
+    @lru_cache(4096)
+    def _key_to_record(self, key):
+        """Details needed to construct a reference for one key"""
+        field, chunk = key.rsplit("/", 1)
+        chunk_sizes = self._get_chunk_sizes(field)
+        if len(chunk_sizes) == 0:
+            return 0, 0, 0
+        chunk_idx = [int(c) for c in chunk.split(".")]
+        chunk_number = ravel_multi_index(chunk_idx, chunk_sizes)
+        record = chunk_number // self.record_size
+        ri = chunk_number % self.record_size
+        return record, ri, len(chunk_sizes)
+
+    def _get_chunk_sizes(self, field):
+        """Chunk count along every axis of ``field`` (memoised per field)."""
+        known = self.chunk_sizes
+        if field not in known:
+            meta = self.zmetadata[f"{field}/.zarray"]
+            shape, chunks = meta["shape"], meta["chunks"]
+            counts = [math.ceil(n / c) for n, c in zip(shape, chunks)]
+            known[field] = counts or [1]  # empty shape -> treat as a single chunk
+        return known[field]
+
+    def _generate_record(self, field, record):
+        """The references for a given parquet file of a given field"""
+        refs = self.open_refs(field, record)
+        it = iter(zip(*refs.values()))
+        if len(refs) == 3:
+            # All urls
+            return (list(t) for t in it)
+        elif len(refs) == 1:
+            # All raws
+            return refs["raw"]
+        else:
+            # Mix of urls and raws
+            return (list(t[:3]) if not t[3] else t[3] for t in it)
+
+    def _generate_all_records(self, field):
+        """Yield every reference of ``field``, walking its parquet files in order."""
+        n_chunks = math.prod(self._get_chunk_sizes(field))
+        n_files = math.ceil(n_chunks / self.record_size)
+        # records are numbered 0..n_files-1, each holding up to record_size refs
+        for record in range(n_files):
+            partition = self._generate_record(field, record)
+            yield from partition
+
+    def values(self):
+        return RefsValuesView(self)
+
+    def items(self):
+        return RefsItemsView(self)
+
+    def __hash__(self):
+        return id(self)
+
+    def __getitem__(self, key):
+        return self._load_one_key(key)
+
+    def __setitem__(self, key, value):
+        if "/" in key and not self._is_meta(key):
+            field, chunk = key.rsplit("/", 1)
+            record, i, _ = self._key_to_record(key)
+            subdict = self._items.setdefault((field, record), {})
+            subdict[i] = value
+            if len(subdict) == self.record_size:
+                self.write(field, record)
+        else:
+            # metadata or top-level
+            self._items[key] = value
+            new_value = json.loads(
+                value.decode() if isinstance(value, bytes) else value
+            )
+            self.zmetadata[key] = {**self.zmetadata.get(key, {}), **new_value}
+
+    @staticmethod
+    def _is_meta(key):
+        return key.startswith(".z") or "/.z" in key
+
+    def __delitem__(self, key):
+        if key in self._items:
+            del self._items[key]
+        elif key in self.zmetadata:
+            del self.zmetadata[key]
+        else:
+            if "/" in key and not self._is_meta(key):
+                field, _ = key.rsplit("/", 1)
+                record, i, _ = self._key_to_record(key)
+                subdict = self._items.setdefault((field, record), {})
+                subdict[i] = None
+                if len(subdict) == self.record_size:
+                    self.write(field, record)
+            else:
+                # metadata or top-level
+                self._items[key] = None
+
+    def write(self, field, record, base_url=None, storage_options=None):
+        # extra requirements if writing
+        import kerchunk.df
+        import numpy as np
+        import pandas as pd
+
+        partition = self._items[(field, record)]
+        original = False
+        if len(partition) < self.record_size:
+            try:
+                original = self.open_refs(field, record)
+            except IOError:
+                pass
+
+        if original:
+            paths = original["path"]
+            offsets = original["offset"]
+            sizes = original["size"]
+            raws = original["raw"]
+        else:
+            paths = np.full(self.record_size, np.nan, dtype="O")
+            offsets = np.zeros(self.record_size, dtype="int64")
+            sizes = np.zeros(self.record_size, dtype="int64")
+            raws = np.full(self.record_size, np.nan, dtype="O")
+        for j, data in partition.items():
+            if isinstance(data, list):
+                if (
+                    str(paths.dtype) == "category"
+                    and data[0] not in paths.dtype.categories
+                ):
+                    paths = paths.add_categories(data[0])
+                paths[j] = data[0]
+                if len(data) > 1:
+                    offsets[j] = data[1]
+                    sizes[j] = data[2]
+            elif data is None:
+                # delete
+                paths[j] = None
+                offsets[j] = 0
+                sizes[j] = 0
+                raws[j] = None
+            else:
+                # this is the only call into kerchunk, could remove
+                raws[j] = kerchunk.df._proc_raw(data)
+        # TODO: only save needed columns
+        df = pd.DataFrame(
+            {
+                "path": paths,
+                "offset": offsets,
+                "size": sizes,
+                "raw": raws,
+            },
+            copy=False,
+        )
+        if df.path.count() / (df.path.nunique() or 1) > self.cat_thresh:
+            df["path"] = df["path"].astype("category")
+        object_encoding = {"raw": "bytes", "path": "utf8"}
+        has_nulls = ["path", "raw"]
+
+        fn = f"{base_url or self.out_root}/{field}/refs.{record}.parq"
+        self.fs.mkdirs(f"{base_url or self.out_root}/{field}", exist_ok=True)
+        df.to_parquet(
+            fn,
+            engine="fastparquet",
+            storage_options=storage_options
+            or getattr(self.fs, "storage_options", None),
+            compression="zstd",
+            index=False,
+            stats=False,
+            object_encoding=object_encoding,
+            has_nulls=has_nulls,
+            # **kwargs,
+        )
+        partition.clear()
+        self._items.pop((field, record))
+
+    def flush(self, base_url=None, storage_options=None):
+        """Output any modified or deleted keys
+
+        Parameters
+        ----------
+        base_url: str
+            Location of the output
+        """
+        # write what we have so far and clear sub chunks
+        for thing in list(self._items):
+            if isinstance(thing, tuple):
+                field, record = thing
+                self.write(
+                    field,
+                    record,
+                    base_url=base_url,
+                    storage_options=storage_options,
+                )
+
+        # gather .zmetadata from self._items and write that too
+        for k in list(self._items):
+            if k != ".zmetadata" and ".z" in k:
+                self.zmetadata[k] = json.loads(self._items.pop(k))
+        met = {"metadata": self.zmetadata, "record_size": self.record_size}
+        self._items[".zmetadata"] = json.dumps(met).encode()
+        self.fs.pipe(
+            "/".join([base_url or self.out_root, ".zmetadata"]),
+            self._items[".zmetadata"],
+        )
+
+        # TODO: only clear those that we wrote to?
+        self.open_refs.cache_clear()
+
+    def __len__(self):
+        # Caveat: This counts expected references, not actual - but is fast
+        count = 0
+        for field in self.listdir():
+            if field.startswith("."):
+                count += 1
+            else:
+                count += math.prod(self._get_chunk_sizes(field))
+        count += len(self.zmetadata)  # all metadata keys
+        # any other files not in reference partitions
+        count += sum(1 for _ in self._items if not isinstance(_, tuple))
+        return count
+
+    def __iter__(self):
+        # Caveat: returns only existing keys, so the number of these does not
+        #  match len(self)
+        metas = set(self.zmetadata)
+        metas.update(self._items)
+        for bit in metas:
+            if isinstance(bit, str):
+                yield bit
+        for field in self.listdir():
+            for k in self._keys_in_field(field):
+                if k in self:
+                    yield k
+
+    def __contains__(self, item):  # a key is present iff it can be resolved
+        try:
+            self._load_one_key(item)
+        except KeyError:
+            return False
+        return True
+
+    def _keys_in_field(self, field):
+        """List key names in given field
+
+        Produces strings like "field/x.y" appropriate from the chunking of the array
+        """
+        chunk_sizes = self._get_chunk_sizes(field)
+        if len(chunk_sizes) == 0:
+            yield field + "/0"
+            return
+        inds = itertools.product(*(range(i) for i in chunk_sizes))
+        for ind in inds:
+            yield field + "/" + ".".join([str(c) for c in ind])
+
+
+class ReferenceFileSystem(AsyncFileSystem):
+    """View byte ranges of some other file as a file system
+    Initial version: single file system target, which must support
+    async, and must allow start and end args in _cat_file. Later versions
+    may allow multiple arbitrary URLs for the targets.
+    This FileSystem is read-only. It is designed to be used with async
+    targets (for now). This FileSystem only allows whole-file access, no
+    ``open``. We do not get original file details from the target FS.
+    Configuration is by passing a dict of references at init, or a URL to
+    a JSON file containing the same; this dict
+    can also contain concrete data for some set of paths.
+    Reference dict format:
+    {path0: bytes_data, path1: (target_url, offset, size)}
+    https://github.com/fsspec/kerchunk/blob/main/README.md
+    """
+
+    protocol = "reference"
+
+    def __init__(
+        self,
+        fo,
+        target=None,
+        ref_storage_args=None,
+        target_protocol=None,
+        target_options=None,
+        remote_protocol=None,
+        remote_options=None,
+        fs=None,
+        template_overrides=None,
+        simple_templates=True,
+        max_gap=64_000,
+        max_block=256_000_000,
+        cache_size=128,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        fo : dict or str
+            The set of references to use for this instance, with a structure as above.
+            If str referencing a JSON file, will use fsspec.open, in conjunction
+            with target_options and target_protocol to open and parse JSON at this
+            location. If a directory, then assume references are a set of parquet
+            files to be loaded lazily.
+        target : str
+            For any references having target_url as None, this is the default file
+            target to use
+        ref_storage_args : dict
+            If references is a str, use these kwargs for loading the JSON file.
+            Deprecated: use target_options instead.
+        target_protocol : str
+            Used for loading the reference file, if it is a path. If None, protocol
+            will be derived from the given path
+        target_options : dict
+            Extra FS options for loading the reference file ``fo``, if given as a path
+        remote_protocol : str
+            The protocol of the filesystem on which the references will be evaluated
+            (unless fs is provided). If not given, will be derived from the first
+            URL that has a protocol in the templates or in the references, in that
+            order.
+        remote_options : dict
+            kwargs to go with remote_protocol
+        fs : AbstractFileSystem | dict(str, (AbstractFileSystem | dict))
+            Directly provide a file system(s):
+                - a single filesystem instance
+                - a dict of protocol:filesystem, where each value is either a filesystem
+                  instance, or a dict of kwargs that can be used to create in
+                  instance for the given protocol
+
+            If this is given, remote_options and remote_protocol are ignored.
+        template_overrides : dict
+            Swap out any templates in the references file with these - useful for
+            testing.
+        simple_templates: bool
+            Whether templates can be processed with simple replace (True) or if
+            jinja  is needed (False, much slower). All reference sets produced by
+            ``kerchunk`` are simple in this sense, but the spec allows for complex.
+        max_gap, max_block: int
+            For merging multiple concurrent requests to the same remote file.
+            Neighboring byte ranges will only be merged when their
+            inter-range gap is <= ``max_gap``. Default is 64KB. Set to 0
+            to only merge when it requires no extra bytes. Pass a negative
+            number to disable merging, appropriate for local target files.
+            Neighboring byte ranges will only be merged when the size of
+            the aggregated range is <= ``max_block``. Default is 256MB.
+        cache_size : int
+            Maximum size of LRU cache, where cache_size*record_size denotes
+            the total number of references that can be loaded in memory at once.
+            Only used for lazily loaded references.
+        kwargs : passed to parent class
+        """
+        super().__init__(**kwargs)
+        self.target = target
+        self.template_overrides = template_overrides
+        self.simple_templates = simple_templates
+        self.templates = {}
+        self.fss = {}
+        self._dircache = {}
+        self.max_gap = max_gap
+        self.max_block = max_block
+        if isinstance(fo, str):
+            dic = dict(
+                **(ref_storage_args or target_options or {}), protocol=target_protocol
+            )
+            ref_fs, fo2 = fsspec.core.url_to_fs(fo, **dic)
+            if ref_fs.isfile(fo2):
+                # text JSON
+                with fsspec.open(fo, "rb", **dic) as f:
+                    logger.info("Read reference from URL %s", fo)
+                    text = json.load(f)
+                self._process_references(text, template_overrides)
+            else:
+                # Lazy parquet refs
+                logger.info("Open lazy reference dict from URL %s", fo)
+                self.references = LazyReferenceMapper(
+                    fo2,
+                    fs=ref_fs,
+                    cache_size=cache_size,
+                )
+        else:
+            # dictionaries
+            self._process_references(fo, template_overrides)
+        if isinstance(fs, dict):
+            self.fss = {
+                k: (
+                    fsspec.filesystem(k.split(":", 1)[0], **opts)
+                    if isinstance(opts, dict)
+                    else opts
+                )
+                for k, opts in fs.items()
+            }
+            if None not in self.fss:
+                self.fss[None] = filesystem("file")
+            return
+        if fs is not None:
+            # single remote FS
+            remote_protocol = (
+                fs.protocol[0] if isinstance(fs.protocol, tuple) else fs.protocol
+            )
+            self.fss[remote_protocol] = fs
+
+        if remote_protocol is None:
+            # get single protocol from any templates
+            for ref in self.templates.values():
+                if callable(ref):
+                    ref = ref()
+                protocol, _ = fsspec.core.split_protocol(ref)
+                if protocol and protocol not in self.fss:
+                    fs = filesystem(protocol, **(remote_options or {}))
+                    self.fss[protocol] = fs
+        if remote_protocol is None:
+            # get single protocol from references
+            # TODO: warning here, since this can be very expensive?
+            for ref in self.references.values():
+                if callable(ref):
+                    ref = ref()
+                if isinstance(ref, list) and ref[0]:
+                    protocol, _ = fsspec.core.split_protocol(ref[0])
+                    if protocol not in self.fss:
+                        fs = filesystem(protocol, **(remote_options or {}))
+                        self.fss[protocol] = fs
+                        # only use first remote URL
+                        break
+
+        if remote_protocol and remote_protocol not in self.fss:
+            fs = filesystem(remote_protocol, **(remote_options or {}))
+            self.fss[remote_protocol] = fs
+
+        self.fss[None] = fs or filesystem("file")  # default one
+
+    def _cat_common(self, path, start=None, end=None):
+        path = self._strip_protocol(path)
+        logger.debug(f"cat: {path}")
+        try:
+            part = self.references[path]
+        except KeyError:
+            raise FileNotFoundError(path)
+        if isinstance(part, str):
+            part = part.encode()
+        if isinstance(part, bytes):
+            logger.debug(f"Reference: {path}, type bytes")
+            if part.startswith(b"base64:"):
+                part = base64.b64decode(part[7:])
+            return part, None, None
+
+        if len(part) == 1:
+            logger.debug(f"Reference: {path}, whole file => {part}")
+            url = part[0]
+            start1, end1 = start, end
+        else:
+            url, start0, size = part
+            logger.debug(f"Reference: {path} => {url}, offset {start0}, size {size}")
+            end0 = start0 + size
+
+            if start is not None:
+                if start >= 0:
+                    start1 = start0 + start
+                else:
+                    start1 = end0 + start
+            else:
+                start1 = start0
+            if end is not None:
+                if end >= 0:
+                    end1 = start0 + end
+                else:
+                    end1 = end0 + end
+            else:
+                end1 = end0
+        if url is None:
+            url = self.target
+        return url, start1, end1
+
+    async def _cat_file(self, path, start=None, end=None, **kwargs):
+        part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
+        if isinstance(part_or_url, bytes):
+            return part_or_url[start:end]  # inline data: slice with the caller's range
+        protocol, _ = split_protocol(part_or_url)
+        try:  # start0/end0 are the range translated into the target (see _cat_common)
+            return await self.fss[protocol]._cat_file(part_or_url, start=start0, end=end0)
+        except Exception as e:
+            raise ReferenceNotReachable(path, part_or_url) from e
+
+    def cat_file(self, path, start=None, end=None, **kwargs):
+        part_or_url, start0, end0 = self._cat_common(path, start=start, end=end)
+        if isinstance(part_or_url, bytes):
+            return part_or_url[start:end]
+        protocol, _ = split_protocol(part_or_url)
+        try:
+            return self.fss[protocol].cat_file(part_or_url, start=start0, end=end0)
+        except Exception as e:
+            raise ReferenceNotReachable(path, part_or_url) from e
+
+    def pipe_file(self, path, value, **_):
+        """Temporarily add binary data or reference as a file"""
+        self.references[path] = value
+
+    async def _get_file(self, rpath, lpath, **kwargs):
+        if self.isdir(rpath):
+            return os.makedirs(lpath, exist_ok=True)
+        data = await self._cat_file(rpath)
+        with open(lpath, "wb") as f:
+            f.write(data)
+
+    def get_file(self, rpath, lpath, callback=DEFAULT_CALLBACK, **kwargs):
+        if self.isdir(rpath):
+            return os.makedirs(lpath, exist_ok=True)
+        data = self.cat_file(rpath, **kwargs)
+        callback.set_size(len(data))
+        if isfilelike(lpath):
+            lpath.write(data)
+        else:
+            with open(lpath, "wb") as f:
+                f.write(data)
+        callback.absolute_update(len(data))
+
+    def get(self, rpath, lpath, recursive=False, **kwargs):
+        if recursive:
+            # trigger directory build
+            self.ls("")
+        rpath = self.expand_path(rpath, recursive=recursive)
+        fs = fsspec.filesystem("file", auto_mkdir=True)
+        targets = other_paths(rpath, lpath)
+        if recursive:
+            data = self.cat([r for r in rpath if not self.isdir(r)])
+        else:
+            data = self.cat(rpath)
+        for remote, local in zip(rpath, targets):
+            if remote in data:
+                fs.pipe_file(local, data[remote])
+
+    def cat(self, path, recursive=False, on_error="raise", **kwargs):
+        """Fetch the bytes of one or more references.
+
+        Parameters
+        ----------
+        path: str or list of str
+            Key(s) to fetch. Glob characters (``*``) in a list of paths are
+            not implemented.
+        recursive: bool
+            Must be False; True raises ``NotImplementedError``.
+        on_error: str
+            ``"raise"`` re-raises the first failure. ``"omit"`` leaves unknown
+            keys out of the result. Any other value stores the exception as
+            the value of the failing key.
+
+        Returns
+        -------
+        The single value when ``path`` is a ``*``-free string that produced
+        exactly one result; otherwise a dict keyed by path.
+        """
+        if isinstance(path, str) and recursive:
+            raise NotImplementedError
+        if isinstance(path, list) and (recursive or any("*" in p for p in path)):
+            raise NotImplementedError
+        # TODO: if references is lazy, pre-fetch all paths in batch before access
+        # group the requested paths so each group is served by one entry of self.fss
+        proto_dict = _protocol_groups(path, self.references)
+        out = {}
+        for proto, paths in proto_dict.items():
+            fs = self.fss[proto]
+            urls, starts, ends, valid_paths = [], [], [], []
+            for p in paths:
+                # find references or label not-found. Early exit if any not
+                # found and on_error is "raise"
+                try:
+                    u, s, e = self._cat_common(p)
+                except FileNotFoundError as err:
+                    if on_error == "raise":
+                        raise
+                    if on_error != "omit":
+                        out[p] = err
+                else:
+                    urls.append(u)
+                    starts.append(s)
+                    ends.append(e)
+                    valid_paths.append(p)
+
+            # process references into form for merging
+            urls2 = []
+            starts2 = []
+            ends2 = []
+            paths2 = []
+            whole_files = set()
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
+                if isinstance(u, bytes):
+                    # data
+                    out[p] = u
+                elif s is None:
+                    # whole file - limits are None, None, but no further
+                    # entries take for this file
+                    whole_files.add(u)
+                    urls2.append(u)
+                    starts2.append(s)
+                    ends2.append(e)
+                    paths2.append(p)
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
+                # second run to account for files that are to be loaded whole
+                if s is not None and u not in whole_files:
+                    urls2.append(u)
+                    starts2.append(s)
+                    ends2.append(e)
+                    paths2.append(p)
+
+            # merge and fetch consolidated ranges
+            new_paths, new_starts, new_ends = merge_offset_ranges(
+                list(urls2),
+                list(starts2),
+                list(ends2),
+                sort=True,
+                max_gap=self.max_gap,
+                max_block=self.max_block,
+            )
+            bytes_out = fs.cat_ranges(new_paths, new_starts, new_ends)
+
+            # unbundle from merged bytes - simple approach
+            for u, s, e, p in zip(urls, starts, ends, valid_paths):
+                if p in out:
+                    continue  # was bytes, already handled
+                for np, ns, ne, b in zip(new_paths, new_starts, new_ends, bytes_out):
+                    if np == u and (ns is None or ne is None):
+                        # the merged entry is a whole file: slice the requested
+                        # range (or everything when s and e are None) out of it
+                        if isinstance(b, Exception):
+                            out[p] = b
+                        else:
+                            out[p] = b[s:e]
+                    elif np == u and s >= ns and e <= ne:
+                        # the requested range lies inside this merged block
+                        if isinstance(b, Exception):
+                            out[p] = b
+                        else:
+                            # offsets relative to the block; "or None" keeps the
+                            # tail when the range ends exactly at the block end
+                            out[p] = b[s - ns : (e - ne) or None]
+
+        # iterate over a copy because entries of ``out`` may be replaced below
+        for k, v in out.copy().items():
+            # these were valid references, but fetch failed, so transform exc
+            # NOTE(review): with on_error="omit" a failed fetch stays in ``out``
+            # as the original exception instead of being dropped - confirm.
+            if isinstance(v, Exception) and k in self.references:
+                ex = out[k]
+                new_ex = ReferenceNotReachable(k, self.references[k])
+                new_ex.__cause__ = ex
+                if on_error == "raise":
+                    raise new_ex
+                elif on_error != "omit":
+                    out[k] = new_ex
+
+        if len(out) == 1 and isinstance(path, str) and "*" not in path:
+            return _first(out)
+        return out
+
+    def _process_references(self, references, template_overrides=None):
+        """Route ``references`` to the loader matching its ``"version"`` entry.
+
+        A missing version selects the Spec Version 0 loader, ``1`` selects the
+        Spec Version 1 loader, anything else raises ``ValueError``.
+        """
+        spec_version = references.get("version", None)
+        if spec_version == 1:
+            self._process_references1(references, template_overrides=template_overrides)
+        elif spec_version is None:
+            self._process_references0(references)
+        else:
+            raise ValueError(f"Unknown reference spec version: {spec_version}")
+        # TODO: we make dircache by iterating over all entries, but for Spec >= 1,
+        #  can replace with programmatic. Is it even needed for mapper interface?
+
+    def _process_references0(self, references):
+        """Adopt a Spec Version 0 reference set.
+
+        The mapping is stored on ``self.references`` as-is; it is neither
+        copied nor transformed.
+        """
+        self.references = references
+
+    def _process_references1(self, references, template_overrides=None):
+        """Build ``self.references`` from a Spec Version 1 reference set.
+
+        Templates are loaded first via ``_process_templates``; then every entry
+        of ``references["refs"]`` is stored, and finally the entries produced
+        by ``_process_gen`` from ``references["gen"]`` are merged in.
+
+        String values are inline data: a ``"base64:"`` prefix marks a binary
+        payload, which is stored as the decoded bytes; any other string is
+        stored unchanged. List values are ``[url]`` or ``[url, offset, length]``
+        and have ``{{...}}`` placeholders in the URL expanded when templates
+        are defined.
+
+        ``template_overrides`` is accepted for interface compatibility but is
+        not read by this method.
+        """
+        if not self.simple_templates or self.templates:
+            import jinja2
+        self.references = {}
+        self._process_templates(references.get("templates", {}))
+
+        # the same templated URL is typically shared by many keys
+        @lru_cache(1000)
+        def _render_jinja(u):
+            return jinja2.Template(u).render(**self.templates)
+
+        for k, v in references.get("refs", {}).items():
+            if isinstance(v, str):
+                if v.startswith("base64:"):
+                    # inline binary payload: keep the decoded bytes (previously
+                    # this result was immediately overwritten by the raw string)
+                    self.references[k] = base64.b64decode(v[7:])
+                else:
+                    # inline text payload is kept verbatim
+                    self.references[k] = v
+            elif self.templates:
+                u = v[0]
+                if "{{" in u:
+                    if self.simple_templates:
+                        # turn "{{name}}" into "{name}" and use str.format
+                        u = (
+                            u.replace("{{", "{")
+                            .replace("}}", "}")
+                            .format(**self.templates)
+                        )
+                    else:
+                        u = _render_jinja(u)
+                self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
+            else:
+                self.references[k] = v
+        self.references.update(self._process_gen(references.get("gen", [])))
+
+    def _process_templates(self, tmp):
+        """Populate ``self.templates`` from the mapping ``tmp``.
+
+        ``self.template_overrides``, when set, is merged into ``tmp`` in place
+        before processing. Values containing ``{{`` become callables that
+        render the value as a jinja2 template with the keyword arguments they
+        are called with; all other values are stored unchanged.
+        """
+        self.templates = {}
+        overrides = self.template_overrides
+        if overrides is not None:
+            tmp.update(overrides)
+        for name, value in tmp.items():
+            if "{{" not in value:
+                self.templates[name] = value
+                continue
+            import jinja2
+
+            # bind ``value`` through a default so each lambda keeps its own text
+            self.templates[name] = lambda temp=value, **kwargs: jinja2.Template(
+                temp
+            ).render(**kwargs)
+
+    def _process_gen(self, gens):
+        """Expand reference generators into concrete reference entries.
+
+        Each generator names ``dimensions`` (either explicit lists or
+        ``start``/``stop``/``step`` ranges). For every combination of
+        dimension values the ``key`` and ``url`` templates - and ``offset`` /
+        ``length`` when both are present - are rendered with jinja2.
+
+        Returns a dict mapping each rendered key to ``[url]`` or
+        ``[url, offset, length]``. Raises ``ValueError`` when only one of
+        ``offset`` and ``length`` is given.
+        """
+        out = {}
+        for gen in gens:
+            dimension = {}
+            for name, spec in gen["dimensions"].items():
+                if isinstance(spec, list):
+                    dimension[name] = spec
+                else:
+                    dimension[name] = range(
+                        spec.get("start", 0), spec["stop"], spec.get("step", 1)
+                    )
+            for values in itertools.product(*dimension.values()):
+                pr = dict(zip(dimension.keys(), values))
+                import jinja2
+
+                def render(field):
+                    # render one generator field for the current combination
+                    return jinja2.Template(gen[field]).render(**pr, **self.templates)
+
+                key = render("key")
+                url = render("url")
+                has_offset = "offset" in gen
+                has_length = "length" in gen
+                if has_offset and has_length:
+                    offset = int(render("offset"))
+                    length = int(render("length"))
+                    out[key] = [url, offset, length]
+                elif has_offset != has_length:
+                    raise ValueError(
+                        "Both 'offset' and 'length' are required for a "
+                        "reference generator entry if either is provided."
+                    )
+                else:
+                    out[key] = [url]
+        return out
+
+    def _dircache_from_items(self):
+        """Rebuild ``self.dircache`` from scratch out of ``self.references``.
+
+        The cache maps a directory path to a list of entry dicts. File sizes
+        are the length of inline data, the third element of a three-element
+        reference, or ``None`` for a single-element reference.
+        """
+        self.dircache = {"": []}
+        for path, part in self.references.items():
+            if isinstance(part, (bytes, str)):
+                size = len(part)
+            elif len(part) == 1:
+                size = None
+            else:
+                _, _, size = part
+            # everything before the last "/" ("" for a top-level key)
+            par = path.rpartition("/")[0]
+
+            # climb from the direct parent until a directory already cached
+            chain = [par]
+            cursor = par
+            while cursor and cursor not in self.dircache:
+                cursor = self._parent(cursor)
+                chain.append(cursor)
+
+            # register newly discovered directories, outermost first
+            for idx in range(len(chain) - 1, 0, -1):
+                parent, child = chain[idx], chain[idx - 1]
+                assert child not in self.dircache
+                assert parent in self.dircache
+                self.dircache[parent].append(
+                    {"name": child, "type": "directory", "size": 0}
+                )
+                self.dircache[child] = []
+
+            self.dircache[par].append({"name": path, "type": "file", "size": size})
+
+    def _open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
+        """Return an in-memory binary file holding the full content of ``path``.
+
+        ``mode``, ``block_size``, ``cache_options`` and extra keyword arguments
+        are accepted but not used.
+        """
+        # load whole chunk into memory
+        return io.BytesIO(self.cat_file(path))
+
+    def ls(self, path, detail=True, **kwargs):
+        """List the entries directly under ``path``.
+
+        A ``LazyReferenceMapper`` answers the listing itself; otherwise the
+        directory cache is built on first use and consulted. Returns entry
+        dicts when ``detail`` is true, else just the names. Raises
+        ``FileNotFoundError`` for an unknown path.
+        """
+        path = self._strip_protocol(path)
+        if not isinstance(self.references, LazyReferenceMapper):
+            if not self.dircache:
+                self._dircache_from_items()
+            entries = self._ls_from_cache(path)
+            if entries is None:
+                raise FileNotFoundError(path)
+            return entries if detail else [o["name"] for o in entries]
+        try:
+            return self.references.ls(path, detail)
+        except KeyError:
+            pass
+        # raised outside the handler so the KeyError is not chained as context
+        raise FileNotFoundError(f"'{path}' is not a known key")
+
+    def exists(self, path, **kwargs):  # overwrite auto-sync version
+        """Report whether ``path`` is a known directory or a known file."""
+        found = self.isdir(path)
+        if not found:
+            found = self.isfile(path)
+        return found
+
+    def isdir(self, path):  # overwrite auto-sync version
+        """Report whether ``path`` is a directory.
+
+        Uses the directory cache when it is populated, asks a
+        ``LazyReferenceMapper`` for its top-level listing, and otherwise scans
+        the reference keys for one starting with ``path + "/"``.
+        """
+        if self.dircache:
+            return path in self.dircache
+        if isinstance(self.references, LazyReferenceMapper):
+            return path in self.references.listdir("")
+        # this may be faster than building dircache for single calls, but
+        # by looping will be slow for many calls; could cache it?
+        prefix = f"{path}/"
+        for key in self.references:
+            if key.startswith(prefix):
+                return True
+        return False
+
+    def isfile(self, path):  # overwrite auto-sync version
+        """Report whether ``path`` is a key of ``self.references``."""
+        known = path in self.references
+        return known
+
+    async def _ls(self, path, detail=True, **kwargs):  # calls fast sync code
+        """Async entry point that simply returns the synchronous ``ls`` result."""
+        listing = self.ls(path, detail, **kwargs)
+        return listing
+
+    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
+        """List every reference key that starts with ``path``, sorted.
+
+        With ``withdirs`` the generic parent implementation is used. An empty
+        ``path`` selects all keys. ``maxdepth`` is honoured only on the
+        ``withdirs`` route. Returns a list of keys, or a dict of key to entry
+        details when ``detail`` is true.
+        """
+        if withdirs:
+            return super().find(
+                path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs
+            )
+        if path:
+            path = self._strip_protocol(path)
+            keys = sorted(filter(lambda k: k.startswith(path), self.references))
+        else:
+            keys = sorted(self.references)
+        if not detail:
+            return keys
+        if not self.dircache:
+            self._dircache_from_items()
+        return {k: self._ls_from_cache(k)[0] for k in keys}
+
+    def info(self, path, **kwargs):
+        """Return the entry dict (``name``, ``type``, ``size``) for ``path``.
+
+        Inline data reports its own length and a three-element reference its
+        stored length. A whole-file reference has its size looked up on the
+        target filesystem. A path that is not a reference is listed with
+        ``ls``; if the listing has no entry named ``path`` it is reported as a
+        directory of size 0.
+        """
+        ref = self.references.get(path)
+        if ref is None:
+            matches = [o for o in self.ls(path, True) if o["name"] == path]
+            if not matches:
+                return {"name": path, "type": "directory", "size": 0}
+            entry = matches[0]
+        elif isinstance(ref, (str, bytes)):
+            # decode base64 here
+            return {"name": path, "type": "file", "size": len(ref)}
+        elif len(ref) > 1:
+            return {"name": path, "type": "file", "size": ref[2]}
+        else:
+            entry = {"name": path, "type": "file", "size": None}
+        if entry["size"] is None:
+            # if this is a whole remote file, update size using remote FS
+            url = self.references[path][0]
+            prot, _ = split_protocol(url)
+            entry["size"] = self.fss[prot].size(url)
+        return entry
+
+    async def _info(self, path, **kwargs):  # calls fast sync code
+        """Async entry point returning the synchronous ``info`` result.
+
+        Keyword arguments are accepted but not forwarded.
+        """
+        details = self.info(path)
+        return details
+
+    async def _rm_file(self, path, **kwargs):
+        """Drop the reference stored under ``path`` and reset the directory cache.
+
+        A missing key is not an error.
+        """
+        # ignores FileNotFound, just as well for directories
+        self.references.pop(path, None)
+        # this is a bit heavy handed
+        self.dircache.clear()
+
+    async def _pipe_file(self, path, data):
+        """Store ``data`` (str or bytes) as the reference for ``path``.
+
+        The directory cache is emptied afterwards.
+        """
+        self.references[path] = data  # can be str or bytes
+        # this is a bit heavy handed
+        self.dircache.clear()
+
+    async def _put_file(self, lpath, rpath, **kwargs):
+        """Read the local file ``lpath`` as bytes and store it under ``rpath``.
+
+        The directory cache is emptied afterwards.
+        """
+        # puts binary
+        with open(lpath, "rb") as src:
+            payload = src.read()
+        self.references[rpath] = payload
+        # this is a bit heavy handed
+        self.dircache.clear()
+
+    def save_json(self, url, **storage_options):
+        """Write modified references into new location
+
+        Bytes values are written as ASCII text when they decode cleanly and as
+        a ``base64:``-prefixed string otherwise; every other value is written
+        unchanged. The output is a version 1 JSON document opened through
+        ``fsspec.open(url, "wb", **storage_options)``.
+        """
+
+        def _jsonable(value):
+            # JSON has no bytes type: prefer plain text, fall back to base64
+            if not isinstance(value, bytes):
+                return value
+            try:
+                return value.decode("ascii")
+            except UnicodeDecodeError:
+                return (b"base64:" + base64.b64encode(value)).decode()
+
+        out = {k: _jsonable(v) for k, v in self.references.items()}
+        with fsspec.open(url, "wb", **storage_options) as f:
+            f.write(json.dumps({"version": 1, "refs": out}).encode())
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py
new file mode 100644
index 0000000000000000000000000000000000000000..77f7b370cd246f9a9bfd34141afc3edd728d13c3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/sftp.py
@@ -0,0 +1,180 @@
+import datetime
+import logging
+import os
+import types
+import uuid
+from stat import S_ISDIR, S_ISLNK
+
+import paramiko
+
+from .. import AbstractFileSystem
+from ..utils import infer_storage_options
+
+logger = logging.getLogger("fsspec.sftp")
+
+
+class SFTPFileSystem(AbstractFileSystem):
+    """Files over SFTP/SSH
+
+    Peer-to-peer filesystem over SSH using paramiko.
+
+    Note: if using this with the ``open`` or ``open_files``, with full URLs,
+    there is no way to tell if a path is relative, so all paths are assumed
+    to be absolute.
+    """
+
+    protocol = "sftp", "ssh"
+
+    def __init__(self, host, **ssh_kwargs):
+        """
+
+        Parameters
+        ----------
+        host: str
+            Hostname or IP as a string
+        temppath: str
+            Location on the server to put files, when within a transaction
+        ssh_kwargs: dict
+            Parameters passed on to connection. See details in
+            https://docs.paramiko.org/en/3.3/api/client.html#paramiko.client.SSHClient.connect
+            May include port, username, password...
+        """
+        # presumably set by the base class for an instance that was already
+        # initialised and is being reused - TODO confirm
+        if self._cached:
+            return
+        super().__init__(**ssh_kwargs)
+        # taken out of ssh_kwargs so it is not forwarded to SSHClient.connect
+        self.temppath = ssh_kwargs.pop("temppath", "/tmp")  # remote temp directory
+        self.host = host
+        self.ssh_kwargs = ssh_kwargs
+        self._connect()
+
+    def _connect(self):
+        """Open the SSH connection and the SFTP session used by all methods."""
+        logger.debug("Connecting to SFTP server %s", self.host)
+        self.client = paramiko.SSHClient()
+        # NOTE(review): AutoAddPolicy accepts unknown host keys without
+        # verification - confirm this is acceptable for the deployment.
+        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        self.client.connect(self.host, **self.ssh_kwargs)
+        self.ftp = self.client.open_sftp()
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """Return only the path component of a URL or path."""
+        return infer_storage_options(path)["path"]
+
+    @staticmethod
+    def _get_kwargs_from_urls(urlpath):
+        """Return the options parsed from ``urlpath`` minus path and protocol."""
+        out = infer_storage_options(urlpath)
+        out.pop("path", None)
+        out.pop("protocol", None)
+        return out
+
+    def mkdir(self, path, create_parents=True, mode=511):
+        """Create the directory ``path``.
+
+        Parameters
+        ----------
+        path: str
+            Directory to create; raises ``FileExistsError`` if it exists.
+        create_parents: bool
+            Also create any missing intermediate directories.
+        mode: int
+            Permissions for the new directories (511 == 0o777).
+        """
+        logger.debug("Creating folder %s", path)
+        if self.exists(path):
+            raise FileExistsError(f"File exists: {path}")
+
+        if create_parents:
+            # forward ``mode`` so the requested permissions are not replaced by
+            # the makedirs default
+            self.makedirs(path, mode=mode)
+        else:
+            self.ftp.mkdir(path, mode)
+
+    def makedirs(self, path, exist_ok=False, mode=511):
+        """Create ``path`` and every missing parent, one component at a time.
+
+        Raises ``FileExistsError`` when ``path`` exists and ``exist_ok`` is
+        false.
+        """
+        if self.exists(path) and not exist_ok:
+            raise FileExistsError(f"File exists: {path}")
+
+        parts = path.split("/")
+        new_path = "/" if path[:1] == "/" else ""
+
+        for part in parts:
+            if part:
+                new_path = f"{new_path}/{part}" if new_path else part
+                if not self.exists(new_path):
+                    self.ftp.mkdir(new_path, mode)
+
+    def rmdir(self, path):
+        """Remove the directory ``path``."""
+        logger.debug("Removing folder %s", path)
+        self.ftp.rmdir(path)
+
+    def info(self, path):
+        """Return the stat details of ``path`` with ``name`` set to ``path``."""
+        stat = self._decode_stat(self.ftp.stat(path))
+        stat["name"] = path
+        return stat
+
+    @staticmethod
+    def _decode_stat(stat, parent_path=None):
+        """Convert a stat result into an fsspec-style info dict.
+
+        ``name`` is left empty unless ``parent_path`` is given, in which case
+        it is ``parent_path`` joined with ``stat.filename``. ``time`` and
+        ``mtime`` are timezone-aware UTC datetimes built from ``st_atime`` and
+        ``st_mtime``.
+        """
+        if S_ISDIR(stat.st_mode):
+            t = "directory"
+        elif S_ISLNK(stat.st_mode):
+            t = "link"
+        else:
+            t = "file"
+        out = {
+            "name": "",
+            "size": stat.st_size,
+            "type": t,
+            "uid": stat.st_uid,
+            "gid": stat.st_gid,
+            "time": datetime.datetime.fromtimestamp(
+                stat.st_atime, tz=datetime.timezone.utc
+            ),
+            "mtime": datetime.datetime.fromtimestamp(
+                stat.st_mtime, tz=datetime.timezone.utc
+            ),
+        }
+        if parent_path:
+            out["name"] = "/".join([parent_path.rstrip("/"), stat.filename])
+        return out
+
+    def ls(self, path, detail=False):
+        """List ``path``: info dicts when ``detail`` is true, else sorted names."""
+        logger.debug("Listing folder %s", path)
+        stats = [self._decode_stat(stat, path) for stat in self.ftp.listdir_iter(path)]
+        if detail:
+            return stats
+        else:
+            paths = [stat["name"] for stat in stats]
+            return sorted(paths)
+
+    def put(self, lpath, rpath, callback=None, **kwargs):
+        """Upload the local file ``lpath`` to ``rpath``.
+
+        ``callback`` and extra keyword arguments are accepted but not used.
+        """
+        logger.debug("Put file %s into %s", lpath, rpath)
+        self.ftp.put(lpath, rpath)
+
+    def get_file(self, rpath, lpath, **kwargs):
+        """Download ``rpath`` to ``lpath``; a remote directory only creates ``lpath``."""
+        if self.isdir(rpath):
+            os.makedirs(lpath, exist_ok=True)
+        else:
+            self.ftp.get(self._strip_protocol(rpath), lpath)
+
+    def _open(self, path, mode="rb", block_size=None, **kwargs):
+        """
+        block_size: int or None
+            If 0, no buffering, if 1, line buffering, if >1, buffer that many
+            bytes, if None use default from paramiko.
+        """
+        logger.debug("Opening file %s", path)
+        if kwargs.get("autocommit", True) is False:
+            # writes to temporary file, move on commit
+            path2 = "/".join([self.temppath, str(uuid.uuid4())])
+            f = self.ftp.open(path2, mode, bufsize=block_size if block_size else -1)
+            f.temppath = path2
+            f.targetpath = path
+            f.fs = self
+            # attach commit/discard as bound methods of this file object
+            f.commit = types.MethodType(commit_a_file, f)
+            f.discard = types.MethodType(discard_a_file, f)
+        else:
+            f = self.ftp.open(path, mode, bufsize=block_size if block_size else -1)
+        return f
+
+    def _rm(self, path):
+        """Remove ``path``, using ``rmdir`` for directories and ``remove`` otherwise."""
+        if self.isdir(path):
+            self.ftp.rmdir(path)
+        else:
+            self.ftp.remove(path)
+
+    def mv(self, old, new):
+        """Rename ``old`` to ``new`` with the SFTP ``posix_rename`` call."""
+        logger.debug("Renaming %s into %s", old, new)
+        self.ftp.posix_rename(old, new)
+
+
+def commit_a_file(self):
+    """Finish a transactional write by moving the temp file onto its target."""
+    source, destination = self.temppath, self.targetpath
+    self.fs.mv(source, destination)
+
+
+def discard_a_file(self):
+    """Abandon a transactional write by removing its temp file."""
+    leftover = self.temppath
+    self.fs._rm(leftover)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py
new file mode 100644
index 0000000000000000000000000000000000000000..867a581582cee1e03b50654b74537f1b3f3a4a08
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/smb.py
@@ -0,0 +1,333 @@
+"""
+This module contains SMBFileSystem class responsible for handling access to
+Windows Samba network shares by using package smbprotocol
+"""
+
+import datetime
+import uuid
+from stat import S_ISDIR, S_ISLNK
+
+import smbclient
+
+from .. import AbstractFileSystem
+from ..utils import infer_storage_options
+
+# ! pylint: disable=bad-continuation
+
+
+class SMBFileSystem(AbstractFileSystem):
+    """Allow reading and writing to Windows and Samba network shares.
+
+    When using `fsspec.open()` for getting a file-like object the URI
+    should be specified as this format:
+    ``smb://workgroup;user:password@server:port/share/folder/file.csv``.
+
+    Example::
+
+        >>> import fsspec
+        >>> with fsspec.open(
+        ...     'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
+        ... ) as smbfile:
+        ...     df = pd.read_csv(smbfile, sep='|', header=None)
+
+    Note that you need to pass in a valid hostname or IP address for the host
+    component of the URL. Do not use the Windows/NetBIOS machine name for the
+    host component.
+
+    The first component of the path in the URL points to the name of the shared
+    folder. Subsequent path components will point to the directory/folder/file.
+
+    The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
+    optional.
+
+    .. note::
+
+        This implementation requires `smbprotocol`_ to be installed, e.g.::
+
+            $ pip install smbprotocol
+            # or
+            # pip install smbprotocol[kerberos]
+
+    .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
+
+    Note: if using this with the ``open`` or ``open_files``, with full URLs,
+    there is no way to tell if a path is relative, so all paths are assumed
+    to be absolute.
+    """
+
+    protocol = "smb"
+
+    # pylint: disable=too-many-arguments
+    def __init__(
+        self,
+        host,
+        port=None,
+        username=None,
+        password=None,
+        timeout=60,
+        encrypt=None,
+        share_access=None,
+        register_session_retries=5,
+        **kwargs,
+    ):
+        """
+        You can use _get_kwargs_from_urls to get some kwargs from
+        a reasonable SMB url.
+
+        Authentication will be anonymous or integrated if username/password are not
+        given.
+
+        Parameters
+        ----------
+        host: str
+            The remote server name/ip to connect to
+        port: int or None
+            Port to connect with. Usually 445, sometimes 139.
+        username: str or None
+            Username to connect with. Required if Kerberos auth is not being used.
+        password: str or None
+            User's password on the server, if using username
+        timeout: int
+            Connection timeout in seconds
+        encrypt: bool
+            Whether to force encryption or not, once this has been set to True
+            the session cannot be changed back to False.
+        share_access: str or None
+            Specifies the default access applied to file open operations
+            performed with this file system object.
+            This affects whether other processes can concurrently open a handle
+            to the same file.
+
+            - None (the default): exclusively locks the file until closed.
+            - 'r': Allow other handles to be opened with read access.
+            - 'w': Allow other handles to be opened with write access.
+            - 'd': Allow other handles to be opened with delete access.
+        register_session_retries: int
+            Number of attempts made to register the session; a failed attempt
+            is followed by a 0.1 second pause.
+        """
+        super().__init__(**kwargs)
+        self.host = host
+        self.port = port
+        self.username = username
+        self.password = password
+        self.timeout = timeout
+        self.encrypt = encrypt
+        # path segment placed between the share name and temp files created
+        # for transactional writes
+        self.temppath = kwargs.pop("temppath", "")
+        self.share_access = share_access
+        self.register_session_retries = register_session_retries
+        self._connect()
+
+    @property
+    def _port(self):
+        """The port to connect with: 445 unless one was given."""
+        return 445 if self.port is None else self.port
+
+    def _connect(self):
+        """Register the SMB session, retrying on any exception.
+
+        NOTE(review): when every attempt fails this returns silently, so the
+        failure only shows up on the first real operation - confirm intended.
+        """
+        import time
+
+        for _ in range(self.register_session_retries):
+            try:
+                smbclient.register_session(
+                    self.host,
+                    username=self.username,
+                    password=self.password,
+                    port=self._port,
+                    encrypt=self.encrypt,
+                    connection_timeout=self.timeout,
+                )
+                break
+            except Exception:
+                time.sleep(0.1)
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        """Return only the path component of a URL or path."""
+        return infer_storage_options(path)["path"]
+
+    @staticmethod
+    def _get_kwargs_from_urls(path):
+        """Return the options parsed from ``path`` minus path and protocol."""
+        # smb://workgroup;user:password@host:port/share/folder/file.csv
+        out = infer_storage_options(path)
+        out.pop("path", None)
+        out.pop("protocol", None)
+        return out
+
+    def mkdir(self, path, create_parents=True, **kwargs):
+        """Create the directory ``path``, with its parents when ``create_parents``."""
+        wpath = _as_unc_path(self.host, path)
+        if create_parents:
+            smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
+        else:
+            smbclient.mkdir(wpath, port=self._port, **kwargs)
+
+    def makedirs(self, path, exist_ok=False):
+        """Create ``path`` and its parents; does nothing for a bare share path."""
+        if _share_has_path(path):
+            wpath = _as_unc_path(self.host, path)
+            smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
+
+    def rmdir(self, path):
+        """Remove the directory ``path``; does nothing for a bare share path."""
+        if _share_has_path(path):
+            wpath = _as_unc_path(self.host, path)
+            smbclient.rmdir(wpath, port=self._port)
+
+    def info(self, path, **kwargs):
+        """Return an info dict for ``path``.
+
+        Directory names get a trailing ``/`` appended. ``time`` and ``mtime``
+        are the raw ``st_atime`` / ``st_mtime`` values of the stat result.
+        """
+        wpath = _as_unc_path(self.host, path)
+        stats = smbclient.stat(wpath, port=self._port, **kwargs)
+        if S_ISDIR(stats.st_mode):
+            stype = "directory"
+        elif S_ISLNK(stats.st_mode):
+            stype = "link"
+        else:
+            stype = "file"
+        res = {
+            "name": path + "/" if stype == "directory" else path,
+            "size": stats.st_size,
+            "type": stype,
+            "uid": stats.st_uid,
+            "gid": stats.st_gid,
+            "time": stats.st_atime,
+            "mtime": stats.st_mtime,
+        }
+        return res
+
+    def created(self, path):
+        """Return the created timestamp of a file as a datetime.datetime"""
+        wpath = _as_unc_path(self.host, path)
+        stats = smbclient.stat(wpath, port=self._port)
+        return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
+
+    def modified(self, path):
+        """Return the modified timestamp of a file as a datetime.datetime"""
+        wpath = _as_unc_path(self.host, path)
+        stats = smbclient.stat(wpath, port=self._port)
+        return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
+
+    def ls(self, path, detail=True, **kwargs):
+        """List ``path``: info dicts when ``detail`` is true, else joined paths."""
+        unc = _as_unc_path(self.host, path)
+        listed = smbclient.listdir(unc, port=self._port, **kwargs)
+        dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
+        if detail:
+            # one stat call per entry
+            dirs = [self.info(d) for d in dirs]
+        return dirs
+
+    # pylint: disable=too-many-arguments
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=-1,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        """
+        block_size: int or None
+            If 0, no buffering, 1, line buffering, >1, buffer that many bytes
+
+        Notes
+        -----
+        By specifying 'share_access' in 'kwargs' it is possible to override the
+        default shared access setting applied in the constructor of this object.
+        The setting applies to transactional (``autocommit=False``) writes too.
+        """
+        # None and negative values both mean "use the default buffering"
+        bls = block_size if block_size is not None and block_size >= 0 else -1
+        wpath = _as_unc_path(self.host, path)
+        share_access = kwargs.pop("share_access", self.share_access)
+        if "w" in mode and autocommit is False:
+            temp = _as_temp_path(self.host, path, self.temppath)
+            # share_access was popped from kwargs above, so hand it over
+            # explicitly; it used to be dropped on this code path
+            return SMBFileOpener(
+                wpath,
+                temp,
+                mode,
+                port=self._port,
+                block_size=bls,
+                share_access=share_access,
+                **kwargs,
+            )
+        return smbclient.open_file(
+            wpath,
+            mode,
+            buffering=bls,
+            share_access=share_access,
+            port=self._port,
+            **kwargs,
+        )
+
+    def copy(self, path1, path2, **kwargs):
+        """Copy within two locations in the same filesystem"""
+        wpath1 = _as_unc_path(self.host, path1)
+        wpath2 = _as_unc_path(self.host, path2)
+        smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
+
+    def _rm(self, path):
+        """Remove a file or directory; does nothing for a bare share path."""
+        if _share_has_path(path):
+            wpath = _as_unc_path(self.host, path)
+            stats = smbclient.stat(wpath, port=self._port)
+            if S_ISDIR(stats.st_mode):
+                smbclient.rmdir(wpath, port=self._port)
+            else:
+                smbclient.remove(wpath, port=self._port)
+
+    def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
+        """Rename ``path1`` to ``path2``.
+
+        ``recursive`` and ``maxdepth`` are accepted but not used.
+        """
+        wpath1 = _as_unc_path(self.host, path1)
+        wpath2 = _as_unc_path(self.host, path2)
+        smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
+
+
+def _as_unc_path(host, path):
+    """Build the UNC form of ``path`` on ``host``.
+
+    The result is two backslashes, the host, then ``path`` with every forward
+    slash turned into a backslash.
+    """
+    backslashed = "\\".join(path.split("/"))
+    return f"\\\\{host}{backslashed}"
+
+
+def _as_temp_path(host, path, temppath):
+    """Return a fresh UNC temp-file path on the same share as ``path``.
+
+    The share is the first component of ``path`` after the leading slash; the
+    file name is a random UUID placed under ``temppath`` on that share.
+    """
+    share_name = path.split("/")[1]
+    return _as_unc_path(host, f"/{share_name}{temppath}/{uuid.uuid4()}")
+
+
+def _share_has_path(path):
+    """Tell whether ``path`` goes deeper than the share name itself.
+
+    Decided purely by counting slashes; one trailing slash is not counted as
+    an extra level.
+    """
+    slashes = path.count("/")
+    threshold = 2 if path.endswith("/") else 1
+    return slashes > threshold
+
+
+class SMBFileOpener:
+    """writes to remote temporary file, move on commit"""
+
+    def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
+        """Remember the target and temp paths, then open the temp file.
+
+        Extra keyword arguments are forwarded to ``smbclient.open_file``.
+        """
+        self.smbfile = None
+        self._incontext = False
+        self.path, self.temp = path, temp
+        self.mode = mode
+        self.port = port
+        self.block_size = block_size
+        self.kwargs = kwargs
+        self._open()
+
+    def _open(self):
+        """Open the temp file unless an open handle already exists."""
+        if self.smbfile is not None and not self.smbfile.closed:
+            return
+        self.smbfile = smbclient.open_file(
+            self.temp,
+            self.mode,
+            port=self.port,
+            buffering=self.block_size,
+            **self.kwargs,
+        )
+
+    def commit(self):
+        """Move temp file to definitive on success."""
+        # TODO: use transaction support in SMB protocol
+        smbclient.replace(self.temp, self.path, port=self.port)
+
+    def discard(self):
+        """Remove the temp file on failure."""
+        smbclient.remove(self.temp, port=self.port)
+
+    def __fspath__(self):
+        """Return the final target path (not the temp path)."""
+        return self.path
+
+    def __iter__(self):
+        """Iterate over the underlying temp file."""
+        return self.smbfile.__iter__()
+
+    def __getattr__(self, item):
+        """Delegate any attribute not found on this object to the temp file."""
+        return getattr(self.smbfile, item)
+
+    def __enter__(self):
+        """Enter the underlying file's context and return what it returns."""
+        self._incontext = True
+        return self.smbfile.__enter__()
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Leave the underlying file's context; its return value is discarded."""
+        self._incontext = False
+        self.smbfile.__exit__(exc_type, exc_value, traceback)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
new file mode 100644
index 0000000000000000000000000000000000000000..412e5ba4d2cdea7db090dc96412e697909a38d78
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/tar.py
@@ -0,0 +1,124 @@
+import logging
+import tarfile
+
+import fsspec
+from fsspec.archive import AbstractArchiveFileSystem
+from fsspec.compression import compr
+from fsspec.utils import infer_compression
+
+typemap = {b"0": "file", b"5": "directory"}
+
+logger = logging.getLogger("tar")
+
+
+class TarFileSystem(AbstractArchiveFileSystem):
+    """Compressed Tar archives as a file-system (read-only)
+
+    Supports the following formats:
+    tar.gz, tar.bz2, tar.xz
+    """
+
+    root_marker = ""
+    protocol = "tar"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        index_store=None,
+        target_options=None,
+        target_protocol=None,
+        compression=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        target_options = target_options or {}
+
+        if isinstance(fo, str):
+            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
+            fo = self.of.open()  # keep the reference
+
+        # Try to infer compression.
+        if compression is None:
+            name = None
+
+            # Try different ways to get hold of the filename. `fo` might either
+            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
+            # `fsspec.AbstractFileSystem` instance.
+            try:
+                # Amended io.BufferedReader or similar.
+                # This uses a "protocol extension" where original filenames are
+                # propagated to archive-like filesystems in order to let them
+                # infer the right compression appropriately.
+                if hasattr(fo, "original"):
+                    name = fo.original
+
+                # fsspec.LocalFileOpener
+                elif hasattr(fo, "path"):
+                    name = fo.path
+
+                # io.BufferedReader
+                elif hasattr(fo, "name"):
+                    name = fo.name
+
+                # fsspec.AbstractFileSystem
+                elif hasattr(fo, "info"):
+                    name = fo.info()["name"]
+
+            except Exception as ex:
+                logger.warning(
+                    f"Unable to determine file name, not inferring compression: {ex}"
+                )
+
+            if name is not None:
+                compression = infer_compression(name)
+                logger.info(f"Inferred compression {compression} from file name {name}")
+
+        if compression is not None:
+            # TODO: tarfile already implements compression with modes like "'r:gz'",
+            #  but then would seek to offset in the file work?
+            fo = compr[compression](fo)
+
+        self._fo_ref = fo
+        self.fo = fo  # the whole instance is a context
+        self.tar = tarfile.TarFile(fileobj=self.fo)
+        self.dir_cache = None
+
+        self.index_store = index_store
+        self.index = None
+        self._index()
+
+    def _index(self):
+        # TODO: load and set saved index, if exists
+        out = {}
+        for ti in self.tar:
+            info = ti.get_info()
+            info["type"] = typemap.get(info["type"], "file")
+            name = ti.get_info()["name"].rstrip("/")
+            out[name] = (info, ti.offset_data)
+
+        self.index = out
+        # TODO: save index to self.index_store here, if set
+
+    def _get_dirs(self):
+        if self.dir_cache is not None:
+            return
+
+        # This enables ls to get directories as children as well as files
+        self.dir_cache = {
+            dirname: {"name": dirname, "size": 0, "type": "directory"}
+            for dirname in self._all_dirnames(self.tar.getnames())
+        }
+        for member in self.tar.getmembers():
+            info = member.get_info()
+            info["name"] = info["name"].rstrip("/")
+            info["type"] = typemap.get(info["type"], "file")
+            self.dir_cache[info["name"]] = info
+
+    def _open(self, path, mode="rb", **kwargs):
+        if mode != "rb":
+            raise ValueError("Read-only filesystem implementation")
+        details, offset = self.index[path]
+        if details["type"] != "file":
+            raise ValueError("Can only handle regular files")
+        return self.tar.extractfile(path)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bac5d51aa52ccfa3319d86c8c8cd384497881a6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/webhdfs.py
@@ -0,0 +1,484 @@
+# https://hadoop.apache.org/docs/r1.0.4/webhdfs.html
+
+import logging
+import os
+import secrets
+import shutil
+import tempfile
+import uuid
+from contextlib import suppress
+from urllib.parse import quote
+
+import requests
+
+from ..spec import AbstractBufferedFile, AbstractFileSystem
+from ..utils import infer_storage_options, tokenize
+
+logger = logging.getLogger("webhdfs")
+
+
+class WebHDFS(AbstractFileSystem):
+    """
+    Interface to HDFS over HTTP using the WebHDFS API. Supports also HttpFS gateways.
+
+    Four auth mechanisms are supported:
+
+    insecure: no auth is done, and the user is assumed to be whoever they
+        say they are (parameter ``user``), or a predefined value such as
+        "dr.who" if not given
+    spnego: when kerberos authentication is enabled, auth is negotiated by
+        requests_kerberos https://github.com/requests/requests-kerberos .
+        This establishes a session based on existing kinit login and/or
+        specified principal/password; parameters are passed with ``kerb_kwargs``
+    token: uses an existing Hadoop delegation token from another secured
+        service. Indeed, this client can also generate such tokens when
+        not insecure. Note that tokens expire, but can be renewed (by a
+        previously specified user) and may allow for proxying.
+    basic-auth: used when both parameter ``user`` and parameter ``password``
+        are provided.
+
+    """
+
+    tempdir = str(tempfile.gettempdir())
+    protocol = "webhdfs", "webHDFS"
+
+    def __init__(
+        self,
+        host,
+        port=50070,
+        kerberos=False,
+        token=None,
+        user=None,
+        password=None,
+        proxy_to=None,
+        kerb_kwargs=None,
+        data_proxy=None,
+        use_https=False,
+        session_cert=None,
+        session_verify=True,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        host: str
+            Name-node address
+        port: int
+            Port for webHDFS
+        kerberos: bool
+            Whether to authenticate with kerberos for this connection
+        token: str or None
+            If given, use this token on every call to authenticate. A user
+            and user-proxy may be encoded in the token and should not be also
+            given
+        user: str or None
+            If given, assert the user name to connect with
+        password: str or None
+            If given, assert the password to use for basic auth. If password
+            is provided, user must be provided also
+        proxy_to: str or None
+            If given, the user has the authority to proxy, and this value is
+            the user in whose name actions are taken
+        kerb_kwargs: dict
+            Any extra arguments for HTTPKerberosAuth, see
+            `<https://github.com/requests/requests-kerberos/blob/master/requests_kerberos/kerberos_.py>`_
+        data_proxy: dict, callable or None
+            If given, map data-node addresses. This can be necessary if the
+            HDFS cluster is behind a proxy, running on Docker or otherwise has
+            a mismatch between the host-names given by the name-node and the
+            address by which to refer to them from the client. If a dict,
+            maps host names ``host->data_proxy[host]``; if a callable, full
+            URLs are passed, and function must conform to
+            ``url->data_proxy(url)``.
+        use_https: bool
+            Whether to connect to the Name-node using HTTPS instead of HTTP
+        session_cert: str or Tuple[str, str] or None
+            Path to a certificate file, or tuple of (cert, key) files to use
+            for the requests.Session
+        session_verify: str, bool or None
+            Path to a certificate file to use for verifying the requests.Session.
+        kwargs
+        """
+        if self._cached:
+            return
+        super().__init__(**kwargs)
+        self.url = f"{'https' if use_https else 'http'}://{host}:{port}/webhdfs/v1"  # noqa
+        self.kerb = kerberos
+        self.kerb_kwargs = kerb_kwargs or {}
+        self.pars = {}
+        self.proxy = data_proxy or {}
+        if token is not None:
+            if user is not None or proxy_to is not None:
+                raise ValueError(
+                    "If passing a delegation token, must not set "
+                    "user or proxy_to, as these are encoded in the"
+                    " token"
+                )
+            self.pars["delegation"] = token
+        self.user = user
+        self.password = password
+
+        if password is not None:
+            if user is None:
+                raise ValueError(
+                    "If passing a password, the user must also be "
+                    "set in order to set up the basic-auth"
+                )
+        else:
+            if user is not None:
+                self.pars["user.name"] = user
+
+        if proxy_to is not None:
+            self.pars["doas"] = proxy_to
+        if kerberos and user is not None:
+            raise ValueError(
+                "If using Kerberos auth, do not specify the "
+                "user, this is handled by kinit."
+            )
+
+        self.session_cert = session_cert
+        self.session_verify = session_verify
+
+        self._connect()
+
+        self._fsid = f"webhdfs_{tokenize(host, port)}"
+
+    @property
+    def fsid(self):
+        return self._fsid
+
+    def _connect(self):
+        self.session = requests.Session()
+
+        if self.session_cert:
+            self.session.cert = self.session_cert
+
+        self.session.verify = self.session_verify
+
+        if self.kerb:
+            from requests_kerberos import HTTPKerberosAuth
+
+            self.session.auth = HTTPKerberosAuth(**self.kerb_kwargs)
+
+        if self.user is not None and self.password is not None:
+            from requests.auth import HTTPBasicAuth
+
+            self.session.auth = HTTPBasicAuth(self.user, self.password)
+
+    def _call(self, op, method="get", path=None, data=None, redirect=True, **kwargs):
+        url = self._apply_proxy(self.url + quote(path or "", safe="/="))
+        args = kwargs.copy()
+        args.update(self.pars)
+        args["op"] = op.upper()
+        logger.debug("sending %s with %s", url, method)
+        out = self.session.request(
+            method=method.upper(),
+            url=url,
+            params=args,
+            data=data,
+            allow_redirects=redirect,
+        )
+        if out.status_code in [400, 401, 403, 404, 500]:
+            try:
+                err = out.json()
+                msg = err["RemoteException"]["message"]
+                exp = err["RemoteException"]["exception"]
+            except (ValueError, KeyError):
+                pass
+            else:
+                if exp in ["IllegalArgumentException", "UnsupportedOperationException"]:
+                    raise ValueError(msg)
+                elif exp in ["SecurityException", "AccessControlException"]:
+                    raise PermissionError(msg)
+                elif exp in ["FileNotFoundException"]:
+                    raise FileNotFoundError(msg)
+                else:
+                    raise RuntimeError(msg)
+        out.raise_for_status()
+        return out
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        replication=None,
+        permissions=None,
+        **kwargs,
+    ):
+        """
+
+        Parameters
+        ----------
+        path: str
+            File location
+        mode: str
+            'rb', 'wb', etc.
+        block_size: int
+            Client buffer size for read-ahead or write buffer
+        autocommit: bool
+            If False, writes to temporary file that only gets put in final
+            location upon commit
+        replication: int
+            Number of copies of file on the cluster, write mode only
+        permissions: str or int
+            posix permissions, write mode only
+        kwargs
+
+        Returns
+        -------
+        WebHDFile instance
+        """
+        block_size = block_size or self.blocksize
+        return WebHDFile(
+            self,
+            path,
+            mode=mode,
+            block_size=block_size,
+            tempdir=self.tempdir,
+            autocommit=autocommit,
+            replication=replication,
+            permissions=permissions,
+        )
+
+    @staticmethod
+    def _process_info(info):
+        info["type"] = info["type"].lower()
+        info["size"] = info["length"]
+        return info
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        return infer_storage_options(path)["path"]
+
+    @staticmethod
+    def _get_kwargs_from_urls(urlpath):
+        out = infer_storage_options(urlpath)
+        out.pop("path", None)
+        out.pop("protocol", None)
+        if "username" in out:
+            out["user"] = out.pop("username")
+        return out
+
+    def info(self, path):
+        out = self._call("GETFILESTATUS", path=path)
+        info = out.json()["FileStatus"]
+        info["name"] = path
+        return self._process_info(info)
+
+    def ls(self, path, detail=False):
+        out = self._call("LISTSTATUS", path=path)
+        infos = out.json()["FileStatuses"]["FileStatus"]
+        for info in infos:
+            self._process_info(info)
+            info["name"] = path.rstrip("/") + "/" + info["pathSuffix"]
+        if detail:
+            return sorted(infos, key=lambda i: i["name"])
+        else:
+            return sorted(info["name"] for info in infos)
+
+    def content_summary(self, path):
+        """Total numbers of files, directories and bytes under path"""
+        out = self._call("GETCONTENTSUMMARY", path=path)
+        return out.json()["ContentSummary"]
+
+    def ukey(self, path):
+        """Checksum info of file, giving method and result"""
+        out = self._call("GETFILECHECKSUM", path=path, redirect=False)
+        if "Location" in out.headers:
+            location = self._apply_proxy(out.headers["Location"])
+            out2 = self.session.get(location)
+            out2.raise_for_status()
+            return out2.json()["FileChecksum"]
+        else:
+            out.raise_for_status()
+            return out.json()["FileChecksum"]
+
+    def home_directory(self):
+        """Get user's home directory"""
+        out = self._call("GETHOMEDIRECTORY")
+        return out.json()["Path"]
+
+    def get_delegation_token(self, renewer=None):
+        """Retrieve token which can give the same authority to other uses
+
+        Parameters
+        ----------
+        renewer: str or None
+            User who may use this token; if None, will be current user
+        """
+        if renewer:
+            out = self._call("GETDELEGATIONTOKEN", renewer=renewer)
+        else:
+            out = self._call("GETDELEGATIONTOKEN")
+        t = out.json()["Token"]
+        if t is None:
+            raise ValueError("No token available for this user/security context")
+        return t["urlString"]
+
+    def renew_delegation_token(self, token):
+        """Make token live longer. Returns new expiry time"""
+        out = self._call("RENEWDELEGATIONTOKEN", method="put", token=token)
+        return out.json()["long"]
+
+    def cancel_delegation_token(self, token):
+        """Stop the token from being useful"""
+        self._call("CANCELDELEGATIONTOKEN", method="put", token=token)
+
+    def chmod(self, path, mod):
+        """Set the permission at path
+
+        Parameters
+        ----------
+        path: str
+            location to set (file or directory)
+        mod: str or int
+            posix representation of the permission, given as an octal string,
+            e.g. '777', or 0o777
+        """
+        self._call("SETPERMISSION", method="put", path=path, permission=mod)
+
+    def chown(self, path, owner=None, group=None):
+        """Change owning user and/or group"""
+        kwargs = {}
+        if owner is not None:
+            kwargs["owner"] = owner
+        if group is not None:
+            kwargs["group"] = group
+        self._call("SETOWNER", method="put", path=path, **kwargs)
+
+    def set_replication(self, path, replication):
+        """
+        Set file replication factor
+
+        Parameters
+        ----------
+        path: str
+            File location (not for directories)
+        replication: int
+            Number of copies of file on the cluster. Should be smaller than
+            number of data nodes; normally 3 on most systems.
+        """
+        self._call("SETREPLICATION", path=path, method="put", replication=replication)
+
+    def mkdir(self, path, **kwargs):
+        self._call("MKDIRS", method="put", path=path)
+
+    def makedirs(self, path, exist_ok=False):
+        if exist_ok is False and self.exists(path):
+            raise FileExistsError(path)
+        self.mkdir(path)
+
+    def mv(self, path1, path2, **kwargs):
+        self._call("RENAME", method="put", path=path1, destination=path2)
+
+    def rm(self, path, recursive=False, **kwargs):
+        self._call(
+            "DELETE",
+            method="delete",
+            path=path,
+            recursive="true" if recursive else "false",
+        )
+
+    def rm_file(self, path, **kwargs):
+        self.rm(path)
+
+    def cp_file(self, lpath, rpath, **kwargs):
+        with self.open(lpath) as lstream:
+            tmp_fname = "/".join([self._parent(rpath), f".tmp.{secrets.token_hex(16)}"])
+            # Perform an atomic copy (stream to a temporary file and
+            # move it to the actual destination).
+            try:
+                with self.open(tmp_fname, "wb") as rstream:
+                    shutil.copyfileobj(lstream, rstream)
+                self.mv(tmp_fname, rpath)
+            except BaseException:  # noqa
+                with suppress(FileNotFoundError):
+                    self.rm(tmp_fname)
+                raise
+
+    def _apply_proxy(self, location):
+        if self.proxy and callable(self.proxy):
+            location = self.proxy(location)
+        elif self.proxy:
+            # as a dict
+            for k, v in self.proxy.items():
+                location = location.replace(k, v, 1)
+        return location
+
+
+class WebHDFile(AbstractBufferedFile):
+    """A file living in HDFS over webHDFS"""
+
+    def __init__(self, fs, path, **kwargs):
+        super().__init__(fs, path, **kwargs)
+        kwargs = kwargs.copy()
+        if kwargs.get("permissions", None) is None:
+            kwargs.pop("permissions", None)
+        if kwargs.get("replication", None) is None:
+            kwargs.pop("replication", None)
+        self.permissions = kwargs.pop("permissions", 511)
+        tempdir = kwargs.pop("tempdir")
+        if kwargs.pop("autocommit", False) is False:
+            self.target = self.path
+            self.path = os.path.join(tempdir, str(uuid.uuid4()))
+
+    def _upload_chunk(self, final=False):
+        """Write one part of a multi-block file upload
+
+        Parameters
+        ==========
+        final: bool
+            This is the last block, so should complete file, if
+            self.autocommit is True.
+        """
+        out = self.fs.session.post(
+            self.location,
+            data=self.buffer.getvalue(),
+            headers={"content-type": "application/octet-stream"},
+        )
+        out.raise_for_status()
+        return True
+
+    def _initiate_upload(self):
+        """Start the CREATE/APPEND call and store the upload URL in self.location"""
+        kwargs = self.kwargs.copy()
+        if "a" in self.mode:
+            op, method = "APPEND", "POST"
+        else:
+            op, method = "CREATE", "PUT"
+            kwargs["overwrite"] = "true"
+        out = self.fs._call(op, method, self.path, redirect=False, **kwargs)
+        self.location = self.fs._apply_proxy(out.headers["Location"])  # "a" mode too
+        if "w" in self.mode:
+            # create empty file to append to
+            out2 = self.fs.session.put(
+                self.location, headers={"content-type": "application/octet-stream"}
+            )
+            out2.raise_for_status()
+            # after creating empty file, change location to append to
+            out2 = self.fs._call("APPEND", "POST", self.path, redirect=False, **kwargs)
+            self.location = self.fs._apply_proxy(out2.headers["Location"])
+
+    def _fetch_range(self, start, end):  # bytes [start, end), clamped to the file size
+        start = max(start, 0)
+        end = min(self.size, end)
+        if start >= end or start >= self.size:
+            return b""
+        out = self.fs._call(
+            "OPEN", path=self.path, offset=start, length=end - start, redirect=False
+        )
+        out.raise_for_status()
+        if "Location" in out.headers:
+            out2 = self.fs.session.get(self.fs._apply_proxy(out.headers["Location"]))
+            out2.raise_for_status()  # do not hand an HTTP error body back as file data
+            return out2.content
+        else:
+            return out.content
+
+    def commit(self):
+        self.fs.mv(self.path, self.target)
+
+    def discard(self):
+        self.fs.rm(self.path)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d9c046bfde313b6868399c4d200bc779c1ab19f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/implementations/zip.py
@@ -0,0 +1,134 @@
+import zipfile
+
+import fsspec
+from fsspec.archive import AbstractArchiveFileSystem
+
+
+class ZipFileSystem(AbstractArchiveFileSystem):
+    """Read/Write contents of ZIP archive as a file-system
+
+    Keeps file object open while instance lives.
+
+    This class is pickleable, but not necessarily thread-safe
+    """
+
+    root_marker = ""
+    protocol = "zip"
+    cachable = False
+
+    def __init__(
+        self,
+        fo="",
+        mode="r",
+        target_protocol=None,
+        target_options=None,
+        compression=zipfile.ZIP_STORED,
+        allowZip64=True,
+        compresslevel=None,
+        **kwargs,
+    ):
+        """
+        Parameters
+        ----------
+        fo: str or file-like
+            Contains ZIP, and must exist. If a str, will fetch file using
+            :meth:`~fsspec.open_files`, which must return one file exactly.
+        mode: str
+            Accept: "r", "w", "a"
+        target_protocol: str (optional)
+            If ``fo`` is a string, this value can be used to override the
+            FS protocol inferred from a URL
+        target_options: dict (optional)
+            Kwargs passed when instantiating the target FS, if ``fo`` is
+            a string.
+        compression, allowZip64, compresslevel: passed to ZipFile
+            Only relevant when creating a ZIP
+        """
+        super().__init__(**kwargs)  # self is bound by super(); do not pass it again
+        if mode not in set("rwa"):
+            raise ValueError(f"mode '{mode}' no understood")
+        self.mode = mode
+        if isinstance(fo, str):
+            if mode == "a":
+                m = "r+b"
+            else:
+                m = mode + "b"
+            fo = fsspec.open(
+                fo, mode=m, protocol=target_protocol, **(target_options or {})
+            )
+        self.force_zip_64 = allowZip64
+        self.of = fo
+        self.fo = fo.__enter__()  # the whole instance is a context
+        self.zip = zipfile.ZipFile(
+            self.fo,
+            mode=mode,
+            compression=compression,
+            allowZip64=allowZip64,
+            compresslevel=compresslevel,
+        )
+        self.dir_cache = None
+
+    @classmethod
+    def _strip_protocol(cls, path):
+        # zip file paths are always relative to the archive root
+        return super()._strip_protocol(path).lstrip("/")
+
+    def __del__(self):
+        if hasattr(self, "zip"):
+            self.close()
+            del self.zip
+
+    def close(self):
+        """Commits any write changes to the file. Done on ``del`` too."""
+        self.zip.close()
+
+    def _get_dirs(self):
+        if self.dir_cache is None or self.mode in set("wa"):
+            # when writing, dir_cache is always in the ZipFile's attributes,
+            # not read from the file.
+            files = self.zip.infolist()
+            self.dir_cache = {
+                dirname.rstrip("/"): {
+                    "name": dirname.rstrip("/"),
+                    "size": 0,
+                    "type": "directory",
+                }
+                for dirname in self._all_dirnames(self.zip.namelist())
+            }
+            for z in files:
+                f = {s: getattr(z, s, None) for s in zipfile.ZipInfo.__slots__}
+                f.update(
+                    {
+                        "name": z.filename.rstrip("/"),
+                        "size": z.file_size,
+                        "type": ("directory" if z.is_dir() else "file"),
+                    }
+                )
+                self.dir_cache[f["name"]] = f
+
+    def pipe_file(self, path, value, **kwargs):
+        # override upstream, because we know the exact file size in this case
+        self.zip.writestr(path, value, **kwargs)
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        path = self._strip_protocol(path)
+        if "r" in mode and self.mode in set("wa"):
+            if self.exists(path):
+                raise OSError("ZipFS can only be open for reading or writing, not both")
+            raise FileNotFoundError(path)
+        if "r" in self.mode and "w" in mode:
+            raise OSError("ZipFS can only be open for reading or writing, not both")
+        out = self.zip.open(path, mode.strip("b"), force_zip64=self.force_zip_64)
+        if "r" in mode:
+            info = self.info(path)
+            out.size = info["size"]
+            out.name = info["name"]
+        return out
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb1efb0414e77679a86ca80af89e32655df306c4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/conftest.py
@@ -0,0 +1,188 @@
+import contextlib
+import gzip
+import json
+import os
+import threading
+from collections import ChainMap
+from http.server import BaseHTTPRequestHandler, HTTPServer
+
+import pytest
+
+requests = pytest.importorskip("requests")
+port = 9898
+data = b"\n".join([b"some test data"] * 1000)
+realfile = f"http://127.0.0.1:{port}/index/realfile"
+index = b'<a href="%s">Link</a>' % realfile.encode()
+listing = open(
+    os.path.join(os.path.dirname(__file__), "data", "listing.html"), "rb"
+).read()
+win = os.name == "nt"
+
+
+def _make_listing(*paths):
+    return "\n".join(
+        f'<a href="http://127.0.0.1:{port}{f}">Link_{i}</a>'
+        for i, f in enumerate(paths)
+    ).encode()
+
+
+@pytest.fixture
+def reset_files():
+    yield
+
+    # Reset the newly added files after the
+    # test is completed.
+    HTTPTestHandler.dynamic_files.clear()
+
+
+class HTTPTestHandler(BaseHTTPRequestHandler):
+    static_files = {
+        "/index/realfile": data,
+        "/index/otherfile": data,
+        "/index": index,
+        "/data/20020401": listing,
+        "/simple/": _make_listing("/simple/file", "/simple/dir/"),
+        "/simple/file": data,
+        "/simple/dir/": _make_listing("/simple/dir/file"),
+        "/simple/dir/file": data,
+    }
+    dynamic_files = {}
+
+    files = ChainMap(dynamic_files, static_files)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def _respond(self, code=200, headers=None, data=b""):
+        headers = headers or {}
+        headers.update({"User-Agent": "test"})
+        self.send_response(code)
+        for k, v in headers.items():
+            self.send_header(k, str(v))
+        self.end_headers()
+        if data:
+            self.wfile.write(data)
+
+    def do_GET(self):
+        file_path = self.path
+        if file_path.endswith("/") and file_path.rstrip("/") in self.files:
+            file_path = file_path.rstrip("/")
+        file_data = self.files.get(file_path)
+        if "give_path" in self.headers:
+            return self._respond(200, data=json.dumps({"path": self.path}).encode())
+        if "redirect" in self.headers and file_path != "/index/realfile":
+            new_url = f"http://127.0.0.1:{port}/index/realfile"
+            return self._respond(301, {"Location": new_url})
+        if file_data is None:
+            return self._respond(404)
+
+        status = 200
+        content_range = f"bytes 0-{len(file_data) - 1}/{len(file_data)}"
+        if ("Range" in self.headers) and ("ignore_range" not in self.headers):
+            ran = self.headers["Range"]
+            b, ran = ran.split("=")
+            start, end = ran.split("-")
+            if start:
+                content_range = f"bytes {start}-{end}/{len(file_data)}"
+                file_data = file_data[int(start) : (int(end) + 1) if end else None]
+            else:
+                # suffix only
+                l = len(file_data)
+                content_range = f"bytes {l - int(end)}-{l - 1}/{l}"
+                file_data = file_data[-int(end) :]
+            if "use_206" in self.headers:
+                status = 206
+        if "give_length" in self.headers:
+            if "gzip_encoding" in self.headers:
+                file_data = gzip.compress(file_data)
+                response_headers = {
+                    "Content-Length": len(file_data),
+                    "Content-Encoding": "gzip",
+                }
+            else:
+                response_headers = {"Content-Length": len(file_data)}
+            self._respond(status, response_headers, file_data)
+        elif "give_range" in self.headers:
+            self._respond(status, {"Content-Range": content_range}, file_data)
+        elif "give_mimetype" in self.headers:
+            self._respond(
+                status, {"Content-Type": "text/html; charset=utf-8"}, file_data
+            )
+        else:
+            self._respond(status, data=file_data)
+
+    def do_POST(self):  # store the uploaded body under the request path
+        length = self.headers.get("Content-Length")  # str or None, never an int
+        file_path = self.path.rstrip("/")
+        if length is None:
+            assert self.headers.get("Transfer-Encoding") == "chunked"
+            self.files[file_path] = b"".join(self.read_chunks())
+        else:
+            self.files[file_path] = self.rfile.read(int(length))
+        self._respond(200)
+
+    do_PUT = do_POST
+
+    def read_chunks(self):
+        length = -1
+        while length != 0:
+            line = self.rfile.readline().strip()
+            if len(line) == 0:
+                length = 0
+            else:
+                length = int(line, 16)
+            yield self.rfile.read(length)
+            self.rfile.readline()
+
+    def do_HEAD(self):
+        if "head_not_auth" in self.headers:
+            return self._respond(
+                403, {"Content-Length": 123}, b"not authorized for HEAD request"
+            )
+        elif "head_ok" not in self.headers:
+            return self._respond(405)
+
+        file_path = self.path.rstrip("/")
+        file_data = self.files.get(file_path)
+        if file_data is None:
+            return self._respond(404)
+
+        if ("give_length" in self.headers) or ("head_give_length" in self.headers):
+            response_headers = {"Content-Length": len(file_data)}
+            if "zero_length" in self.headers:
+                response_headers["Content-Length"] = 0
+            elif "gzip_encoding" in self.headers:
+                file_data = gzip.compress(file_data)
+                response_headers["Content-Encoding"] = "gzip"
+                response_headers["Content-Length"] = len(file_data)
+
+            self._respond(200, response_headers)
+        elif "give_range" in self.headers:
+            self._respond(
+                200, {"Content-Range": f"0-{len(file_data) - 1}/{len(file_data)}"}
+            )
+        elif "give_etag" in self.headers:
+            self._respond(200, {"ETag": "xxx"})
+        else:
+            self._respond(200)  # OK response, but no useful info
+
+
+@contextlib.contextmanager
+def serve():
+    server_address = ("", port)
+    httpd = HTTPServer(server_address, HTTPTestHandler)
+    th = threading.Thread(target=httpd.serve_forever)
+    th.daemon = True
+    th.start()
+    try:
+        yield f"http://127.0.0.1:{port}"
+    finally:
+        httpd.socket.close()
+        httpd.shutdown()
+        th.join()
+
+
+@pytest.fixture(scope="module")
+def server():
+    with serve() as s:
+        yield s
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..24e6101f0988a90f842df12dcc44fdb8cabab7e6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_api.py
@@ -0,0 +1,498 @@
+"""Tests the spec, using memoryfs"""
+
+import contextlib
+import os
+import pickle
+import tempfile
+from unittest.mock import Mock
+
+import pytest
+
+import fsspec
+from fsspec.implementations.memory import MemoryFile, MemoryFileSystem
+
+
+def test_idempotent():
+    MemoryFileSystem.clear_instance_cache()
+    fs = MemoryFileSystem()
+    fs2 = MemoryFileSystem()
+    assert fs is fs2
+    assert MemoryFileSystem.current() is fs2
+
+    MemoryFileSystem.clear_instance_cache()
+    assert not MemoryFileSystem._cache
+
+    fs2 = MemoryFileSystem().current()
+    assert fs == fs2
+
+
+def test_pickle():
+    fs = MemoryFileSystem()
+    fs2 = pickle.loads(pickle.dumps(fs))
+    assert fs == fs2
+
+
+def test_class_methods():
+    assert MemoryFileSystem._strip_protocol("memory://stuff") == "/stuff"
+    assert MemoryFileSystem._strip_protocol("stuff") == "/stuff"
+    assert MemoryFileSystem._strip_protocol("other://stuff") == "other://stuff"
+
+    assert MemoryFileSystem._get_kwargs_from_urls("memory://user@thing") == {}
+
+
+def test_multi(m):
+    m.pipe("/afile", b"data")
+    fs, token, paths = fsspec.core.get_fs_token_paths(["/afile", "/afile"])
+    assert len(paths) == 2
+
+
+def test_get_put(tmpdir, m):
+    tmpdir = str(tmpdir)
+    fn = os.path.join(tmpdir, "one")
+    open(fn, "wb").write(b"one")
+    os.mkdir(os.path.join(tmpdir, "dir"))
+    fn2 = os.path.join(tmpdir, "dir", "two")
+    open(fn2, "wb").write(b"two")
+
+    fs = MemoryFileSystem()
+    fs.put(fn, "/afile")
+    assert fs.cat("/afile") == b"one"
+
+    fs.store["/bfile"] = MemoryFile(fs, "/bfile", b"data")
+    fn3 = os.path.join(tmpdir, "three")
+    fs.get("/bfile", fn3)
+    assert open(fn3, "rb").read() == b"data"
+
+    fs.put(tmpdir, "/more", recursive=True)
+    assert fs.find("/more") == ["/more/dir/two", "/more/one", "/more/three"]
+
+    @contextlib.contextmanager
+    def tmp_chdir(path):
+        curdir = os.getcwd()
+        os.chdir(path)
+        try:
+            yield
+        finally:
+            os.chdir(curdir)
+
+    with tmp_chdir(os.path.join(tmpdir, os.path.pardir)):
+        fs.put(os.path.basename(tmpdir), "/moretwo", recursive=True)
+        assert fs.find("/moretwo") == [
+            "/moretwo/dir/two",
+            "/moretwo/one",
+            "/moretwo/three",
+        ]
+
+    with tmp_chdir(tmpdir):
+        fs.put(os.path.curdir, "/morethree", recursive=True)
+        assert fs.find("/morethree") == [
+            "/morethree/dir/two",
+            "/morethree/one",
+            "/morethree/three",
+        ]
+
+    for f in [fn, fn2, fn3]:
+        os.remove(f)
+    os.rmdir(os.path.join(tmpdir, "dir"))
+
+    fs.get("/more/", tmpdir + "/", recursive=True)
+    assert open(fn3, "rb").read() == b"data"
+    assert open(fn, "rb").read() == b"one"
+
+
+def test_du(m):
+    fs = MemoryFileSystem()
+    fs.store.update(
+        {
+            "/dir/afile": MemoryFile(fs, "/afile", b"a"),
+            "/dir/dirb/afile": MemoryFile(fs, "/afile", b"bb"),
+            "/dir/dirb/bfile": MemoryFile(fs, "/afile", b"ccc"),
+        }
+    )
+    assert fs.du("/dir") == 6
+    assert fs.du("/dir", total=False) == {
+        "/dir/afile": 1,
+        "/dir/dirb/afile": 2,
+        "/dir/dirb/bfile": 3,
+    }
+    assert fs.du("/dir", withdirs=True) == 6
+    assert fs.du("/dir", total=False, withdirs=True) == {
+        "/dir": 0,
+        "/dir/afile": 1,
+        "/dir/dirb": 0,
+        "/dir/dirb/afile": 2,
+        "/dir/dirb/bfile": 3,
+    }
+    with pytest.raises(ValueError):
+        assert fs.du("/dir", maxdepth=0) == 1
+    assert fs.du("/dir", total=False, withdirs=True, maxdepth=1) == {
+        "/dir": 0,
+        "/dir/afile": 1,
+        "/dir/dirb": 0,
+    }
+
+    # Size of file only.
+    assert fs.du("/dir/afile") == 1
+    assert fs.du("/dir/afile", withdirs=True) == 1
+
+
+def test_head_tail(m):
+    fs = MemoryFileSystem()
+    with fs.open("/myfile", "wb") as f:
+        f.write(b"I had a nice big cabbage")
+    assert fs.head("/myfile", 5) == b"I had"
+    assert fs.tail("/myfile", 7) == b"cabbage"
+
+
+def test_move(m):
+    fs = MemoryFileSystem()
+    with fs.open("/myfile", "wb") as f:
+        f.write(b"I had a nice big cabbage")
+    fs.move("/myfile", "/otherfile")
+    assert not fs.exists("/myfile")
+    assert fs.info("/otherfile")
+    assert isinstance(fs.ukey("/otherfile"), str)
+
+
+def test_recursive_get_put(tmpdir, m):
+    """Round-trip a nested local tree through MemoryFileSystem put/get."""
+    fs = MemoryFileSystem()
+    os.makedirs(f"{tmpdir}/nest")
+    for file in ["one", "two", "nest/other"]:
+        with open(f"{tmpdir}/{file}", "wb") as f:
+            f.write(b"data")
+    fs.put(str(tmpdir), "test", recursive=True)
+
+    # trailing slash on the source: its contents land directly in d
+    d = tempfile.mkdtemp()
+    fs.get("test/", d, recursive=True)
+    for file in ["one", "two", "nest/other"]:
+        with open(f"{d}/{file}", "rb") as f:
+            assert f.read() == b"data"
+
+    # no trailing slash: the "test" directory itself is created under d
+    d = tempfile.mkdtemp()
+    fs.get("test", d, recursive=True)
+    for file in ["test/one", "test/two", "test/nest/other"]:
+        with open(f"{d}/{file}", "rb") as f:
+            assert f.read() == b"data"
+
+
+def test_pipe_cat(m):
+    fs = MemoryFileSystem()
+    fs.pipe("afile", b"contents")
+    assert fs.cat("afile") == b"contents"
+
+    data = {"/bfile": b"more", "/cfile": b"stuff"}
+    fs.pipe(data)
+    assert fs.cat(list(data)) == data
+
+
+def test_read_block_delimiter(m):
+    fs = MemoryFileSystem()
+    with fs.open("/myfile", "wb") as f:
+        f.write(b"some\nlines\nof\ntext")
+    assert fs.read_block("/myfile", 0, 2, b"\n") == b"some\n"
+    assert fs.read_block("/myfile", 2, 6, b"\n") == b"lines\n"
+    assert fs.read_block("/myfile", 6, 2, b"\n") == b""
+    assert fs.read_block("/myfile", 2, 9, b"\n") == b"lines\nof\n"
+    assert fs.read_block("/myfile", 12, 6, b"\n") == b"text"
+    assert fs.read_block("/myfile", 0, None) == fs.cat("/myfile")
+
+
+def test_open_text(m):
+    fs = MemoryFileSystem()
+    with fs.open("/myfile", "wb") as f:
+        f.write(b"some\nlines\nof\ntext")
+    f = fs.open("/myfile", "r", encoding="latin1")
+    assert f.encoding == "latin1"
+
+
+def test_read_text(m):
+    with m.open("/myfile", "w", encoding="utf-8") as f:
+        f.write("some\nlines\nof\ntext")
+    assert m.read_text("/myfile", encoding="utf-8") == "some\nlines\nof\ntext"
+
+
+def test_write_text(m):
+    m.write_text("/myfile", "some\nlines\nof\ntext", encoding="utf-8")
+    assert m.read_text("/myfile", encoding="utf-8") == "some\nlines\nof\ntext"
+
+
+def test_chained_fs():
+    d1 = tempfile.mkdtemp()
+    d2 = tempfile.mkdtemp()
+    f1 = os.path.join(d1, "f1")
+    with open(f1, "wb") as f:
+        f.write(b"test")
+
+    of = fsspec.open(
+        f"simplecache::file://{f1}",
+        simplecache={"cache_storage": d2, "same_names": True},
+    )
+    with of as f:
+        assert f.read() == b"test"
+
+    assert os.listdir(d2) == ["f1"]
+
+
+@pytest.mark.xfail(reason="see issue #334", strict=True)
+def test_multilevel_chained_fs():
+    """This test reproduces fsspec/filesystem_spec#334"""
+    import zipfile
+
+    d1 = tempfile.mkdtemp()
+    f1 = os.path.join(d1, "f1.zip")
+    with zipfile.ZipFile(f1, mode="w") as z:
+        # filename, content
+        z.writestr("foo.txt", "foo.txt")
+        z.writestr("bar.txt", "bar.txt")
+
+    # We expected this to be the correct syntax
+    with pytest.raises(IsADirectoryError):
+        of = fsspec.open_files(f"zip://*.txt::simplecache::file://{f1}")
+        assert len(of) == 2
+
+    # But this is what is actually valid...
+    of = fsspec.open_files(f"zip://*.txt::simplecache://{f1}::file://")
+
+    assert len(of) == 2
+    for open_file in of:
+        with open_file as f:
+            assert f.read().decode("utf-8") == f.name
+
+
+def test_multilevel_chained_fs_zip_zip_file():
+    """This test reproduces fsspec/filesystem_spec#334"""
+    import zipfile
+
+    d1 = tempfile.mkdtemp()
+    f1 = os.path.join(d1, "f1.zip")
+    f2 = os.path.join(d1, "f2.zip")
+    with zipfile.ZipFile(f1, mode="w") as z:
+        # filename, content
+        z.writestr("foo.txt", "foo.txt")
+        z.writestr("bar.txt", "bar.txt")
+
+    with zipfile.ZipFile(f2, mode="w") as z:
+        with open(f1, "rb") as f:
+            z.writestr("f1.zip", f.read())
+
+    # We expected this to be the correct syntax
+    of = fsspec.open_files(f"zip://*.txt::zip://f1.zip::file://{f2}")
+
+    assert len(of) == 2
+    for open_file in of:
+        with open_file as f:
+            assert f.read().decode("utf-8") == f.name
+
+
+def test_chained_equivalent():
+    d1 = tempfile.mkdtemp()
+    d2 = tempfile.mkdtemp()
+    f1 = os.path.join(d1, "f1")
+    with open(f1, "wb") as f:
+        f.write(b"test1")
+
+    of = fsspec.open(
+        f"simplecache::file://{f1}",
+        simplecache={"cache_storage": d2, "same_names": True},
+    )
+    of2 = fsspec.open(
+        f"simplecache://{f1}",
+        cache_storage=d2,
+        same_names=True,
+        target_protocol="file",
+        target_options={},
+    )
+    # the following line passes by fluke - they are not quite the same instance,
+    #  since the parameters don't quite match. Also, the url understood by the two
+    #  of s are not the same (path gets munged a bit differently)
+    assert of.fs == of2.fs
+    assert hash(of.fs) == hash(of2.fs)
+    assert of.open().read() == of2.open().read()
+
+
+def test_chained_fs_multi():
+    d1 = tempfile.mkdtemp()
+    d2 = tempfile.mkdtemp()
+    f1 = os.path.join(d1, "f1")
+    f2 = os.path.join(d1, "f2")
+    with open(f1, "wb") as f:
+        f.write(b"test1")
+    with open(f2, "wb") as f:
+        f.write(b"test2")
+
+    of = fsspec.open_files(
+        f"simplecache::file://{d1}/*",
+        simplecache={"cache_storage": d2, "same_names": True},
+    )
+    with of[0] as f:
+        assert f.read() == b"test1"
+    with of[1] as f:
+        assert f.read() == b"test2"
+
+    assert sorted(os.listdir(d2)) == ["f1", "f2"]
+
+    d2 = tempfile.mkdtemp()
+
+    of = fsspec.open_files(
+        [f"simplecache::file://{f1}", f"simplecache::file://{f2}"],
+        simplecache={"cache_storage": d2, "same_names": True},
+    )
+    with of[0] as f:
+        assert f.read() == b"test1"
+    with of[1] as f:
+        assert f.read() == b"test2"
+
+    assert sorted(os.listdir(d2)) == ["f1", "f2"]
+
+
+def test_chained_fo():
+    import zipfile
+
+    d1 = tempfile.mkdtemp()
+    f1 = os.path.join(d1, "temp.zip")
+    d3 = tempfile.mkdtemp()
+    with zipfile.ZipFile(f1, mode="w") as z:
+        z.writestr("afile", b"test")
+
+    of = fsspec.open(f"zip://afile::file://{f1}")
+    with of as f:
+        assert f.read() == b"test"
+
+    of = fsspec.open_files(f"zip://*::file://{f1}")
+    with of[0] as f:
+        assert f.read() == b"test"
+
+    of = fsspec.open_files(
+        f"simplecache::zip://*::file://{f1}",
+        simplecache={"cache_storage": d3, "same_names": True},
+    )
+    with of[0] as f:
+        assert f.read() == b"test"
+    assert "afile" in os.listdir(d3)
+
+
+def test_url_to_fs():
+    url = "memory://a.txt"
+    fs, url2 = fsspec.core.url_to_fs(url)
+
+    assert isinstance(fs, MemoryFileSystem)
+    assert url2 == "/a.txt"
+
+
+def test_walk(m):
+    # depth = 0
+    dir1 = "/dir1"
+    # depth = 1 (2 dirs, 1 file)
+    dir11 = dir1 + "/dir11"
+    dir12 = dir1 + "/dir12"
+    file11 = dir1 + "/file11"
+    # depth = 2
+    dir111 = dir11 + "/dir111"
+    file111 = dir11 + "/file111"
+    file121 = dir12 + "/file121"
+    # depth = 3
+    file1111 = dir111 + "/file1111"
+
+    m.mkdir(dir111)  # Creates parents too
+    m.mkdir(dir12)  # Creates parents too
+    m.touch(file11)
+    m.touch(file111)
+    m.touch(file121)
+    m.touch(file1111)
+
+    # No maxdepth
+    assert list(m.walk(dir1, topdown=True)) == [
+        (dir1, ["dir11", "dir12"], ["file11"]),
+        (dir11, ["dir111"], ["file111"]),
+        (dir111, [], ["file1111"]),
+        (dir12, [], ["file121"]),
+    ]
+    assert list(m.walk(dir1, topdown=False)) == [
+        (dir111, [], ["file1111"]),
+        (dir11, ["dir111"], ["file111"]),
+        (dir12, [], ["file121"]),
+        (dir1, ["dir11", "dir12"], ["file11"]),
+    ]
+
+    # maxdepth=2
+    assert list(m.walk(dir1, maxdepth=2, topdown=True)) == [
+        (dir1, ["dir11", "dir12"], ["file11"]),
+        (dir11, ["dir111"], ["file111"]),
+        (dir12, [], ["file121"]),
+    ]
+    assert list(m.walk(dir1, maxdepth=2, topdown=False)) == [
+        (dir11, ["dir111"], ["file111"]),
+        (dir12, [], ["file121"]),
+        (dir1, ["dir11", "dir12"], ["file11"]),
+    ]
+
+    # maxdepth=1
+    assert list(m.walk(dir1, maxdepth=1, topdown=True)) == [
+        (dir1, ["dir11", "dir12"], ["file11"]),
+    ]
+    assert list(m.walk(dir1, maxdepth=1, topdown=False)) == [
+        (dir1, ["dir11", "dir12"], ["file11"]),
+    ]
+
+    # maxdepth=0
+    with pytest.raises(ValueError):
+        list(m.walk(dir1, maxdepth=0, topdown=True))
+    with pytest.raises(ValueError):
+        list(m.walk(dir1, maxdepth=0, topdown=False))
+
+    # prune dir111
+    def _walk(*args, **kwargs):
+        for path, dirs, files in m.walk(*args, **kwargs):
+            yield (path, dirs.copy(), files)
+            if "dir111" in dirs:
+                dirs.remove("dir111")
+
+    assert list(_walk(dir1, topdown=True)) == [
+        (dir1, ["dir11", "dir12"], ["file11"]),
+        (dir11, ["dir111"], ["file111"]),
+        (dir12, [], ["file121"]),
+    ]
+    assert list(_walk(dir1, topdown=False)) == [
+        (dir111, [], ["file1111"]),
+        (dir11, ["dir111"], ["file111"]),
+        (dir12, [], ["file121"]),
+        (dir1, ["dir11", "dir12"], ["file11"]),
+    ]
+
+    # reverse dirs order
+    def _walk(*args, **kwargs):
+        for path, dirs, files in m.walk(*args, **kwargs):
+            yield (path, dirs.copy(), files)
+            dirs.reverse()
+
+    assert list(_walk(dir1, topdown=True)) == [
+        (dir1, ["dir11", "dir12"], ["file11"]),
+        # Here dir12 comes before dir11
+        (dir12, [], ["file121"]),
+        (dir11, ["dir111"], ["file111"]),
+        (dir111, [], ["file1111"]),
+    ]
+    assert list(_walk(dir1, topdown=False)) == [
+        (dir111, [], ["file1111"]),
+        (dir11, ["dir111"], ["file111"]),
+        (dir12, [], ["file121"]),
+        (dir1, ["dir11", "dir12"], ["file11"]),
+    ]
+
+    # on_error omit by default
+    assert list(m.walk("do_not_exist")) == []
+    # on_error omit
+    assert list(m.walk("do_not_exist", on_error="omit")) == []
+    # on_error raise
+    with pytest.raises(FileNotFoundError):
+        list(m.walk("do_not_exist", on_error="raise"))
+    # on_error callable function
+    mock = Mock()
+    assert list(m.walk("do_not_exist", on_error=mock.onerror)) == []
+    mock.onerror.assert_called()
+    assert mock.onerror.call_args.kwargs == {}
+    assert len(mock.onerror.call_args.args) == 1
+    assert isinstance(mock.onerror.call_args.args[0], FileNotFoundError)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1a29420fa0de7cd571f25556500645ad62be59d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_async.py
@@ -0,0 +1,230 @@
+import asyncio
+import inspect
+import io
+import os
+import time
+
+import pytest
+
+import fsspec
+import fsspec.asyn
+from fsspec.asyn import _run_coros_in_chunks
+
+
+def test_sync_methods():
+    inst = fsspec.asyn.AsyncFileSystem()
+    assert inspect.iscoroutinefunction(inst._info)
+    assert hasattr(inst, "info")
+    assert inst.info.__qualname__ == "AsyncFileSystem._info"
+    assert not inspect.iscoroutinefunction(inst.info)
+
+
+def test_when_sync_methods_are_disabled():
+    class TestFS(fsspec.asyn.AsyncFileSystem):
+        mirror_sync_methods = False
+
+    inst = TestFS()
+    assert inspect.iscoroutinefunction(inst._info)
+    assert not inspect.iscoroutinefunction(inst.info)
+    assert inst.info.__qualname__ == "AbstractFileSystem.info"
+
+
+def test_interrupt():
+    loop = fsspec.asyn.get_loop()
+
+    async def f():
+        await asyncio.sleep(1000000)
+        return True
+
+    fut = asyncio.run_coroutine_threadsafe(f(), loop)
+    time.sleep(0.01)  # task launches
+    out = fsspec.asyn._dump_running_tasks(with_task=True)
+    task = out[0]["task"]
+    assert task.done() and fut.done()
+    assert isinstance(fut.exception(), fsspec.asyn.FSSpecCoroutineCancel)
+
+
+class _DummyAsyncKlass:
+    def __init__(self):
+        self.loop = fsspec.asyn.get_loop()
+
+    async def _dummy_async_func(self):
+        # Sleep 1 second function to test timeout
+        await asyncio.sleep(1)
+        return True
+
+    async def _bad_multiple_sync(self):
+        fsspec.asyn.sync_wrapper(_DummyAsyncKlass._dummy_async_func)(self)
+        return True
+
+    dummy_func = fsspec.asyn.sync_wrapper(_dummy_async_func)
+    bad_multiple_sync_func = fsspec.asyn.sync_wrapper(_bad_multiple_sync)
+
+
+def test_sync_wrapper_timeout_on_less_than_expected_wait_time_not_finish_function():
+    test_obj = _DummyAsyncKlass()
+    with pytest.raises(fsspec.FSTimeoutError):
+        test_obj.dummy_func(timeout=0.1)
+
+
+def test_sync_wrapper_timeout_on_more_than_expected_wait_time_will_finish_function():
+    test_obj = _DummyAsyncKlass()
+    assert test_obj.dummy_func(timeout=5)
+
+
+def test_sync_wrapper_timeout_none_will_wait_func_finished():
+    test_obj = _DummyAsyncKlass()
+    assert test_obj.dummy_func(timeout=None)
+
+
+def test_sync_wrapper_treat_timeout_0_as_none():
+    test_obj = _DummyAsyncKlass()
+    assert test_obj.dummy_func(timeout=0)
+
+
+def test_sync_wrapper_bad_multiple_sync():
+    test_obj = _DummyAsyncKlass()
+    with pytest.raises(NotImplementedError):
+        test_obj.bad_multiple_sync_func(timeout=5)
+
+
+def test_run_coros_in_chunks(monkeypatch):
+    total_running = 0
+
+    async def runner():
+        nonlocal total_running
+
+        total_running += 1
+        await asyncio.sleep(0)
+        if total_running > 4:
+            raise ValueError("More than 4 coroutines are running together")
+        total_running -= 1
+        return 1
+
+    async def main(**kwargs):
+        nonlocal total_running
+
+        total_running = 0
+        coros = [runner() for _ in range(32)]
+        results = await _run_coros_in_chunks(coros, **kwargs)
+        for result in results:
+            if isinstance(result, Exception):
+                raise result
+        return results
+
+    assert sum(asyncio.run(main(batch_size=4))) == 32
+
+    with pytest.raises(ValueError):
+        asyncio.run(main(batch_size=5))
+
+    with pytest.raises(ValueError):
+        asyncio.run(main(batch_size=-1))
+
+    assert sum(asyncio.run(main(batch_size=4))) == 32
+
+    monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 5)
+    with pytest.raises(ValueError):
+        asyncio.run(main())
+    assert sum(asyncio.run(main(batch_size=4))) == 32  # override
+
+    monkeypatch.setitem(fsspec.config.conf, "gather_batch_size", 4)
+    assert sum(asyncio.run(main())) == 32  # override
+
+
+@pytest.mark.skipif(os.name != "nt", reason="only for windows")
+def test_windows_policy():
+    from asyncio.windows_events import SelectorEventLoop
+
+    loop = fsspec.asyn.get_loop()
+    policy = asyncio.get_event_loop_policy()
+
+    # Ensure that the created loop always uses selector policy
+    assert isinstance(loop, SelectorEventLoop)
+
+    # Ensure that the global policy is not changed and it is
+    # set to the default one. This is important since the
+    # get_loop() method will temporarily override the policy
+    # with the one which uses selectors on windows, so this
+    # check ensures that we are restoring the old policy back
+    # after our change.
+    assert isinstance(policy, asyncio.DefaultEventLoopPolicy)
+
+
+def test_running_async():
+    assert not fsspec.asyn.running_async()
+
+    async def go():
+        assert fsspec.asyn.running_async()
+
+    asyncio.run(go())
+
+
+class DummyAsyncFS(fsspec.asyn.AsyncFileSystem):
+    _file_class = fsspec.asyn.AbstractAsyncStreamedFile
+
+    async def _info(self, path, **kwargs):
+        return {"name": "misc/foo.txt", "type": "file", "size": 100}
+
+    async def open_async(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        return DummyAsyncStreamedFile(
+            self,
+            path,
+            mode,
+            block_size,
+            autocommit,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+
+class DummyAsyncStreamedFile(fsspec.asyn.AbstractAsyncStreamedFile):
+    """Test double whose reads and uploads are backed by an in-memory BytesIO."""
+
+    def __init__(self, fs, path, mode, block_size, autocommit, **kwargs):
+        super().__init__(fs, path, mode, block_size, autocommit, **kwargs)
+        # Payload handed out by _fetch_range until an upload replaces the buffer.
+        self.temp_buffer = io.BytesIO(b"foo-bar" * 20)
+
+    async def _fetch_range(self, start, end):
+        return self.temp_buffer.read(end - start)
+
+    async def _initiate_upload(self):
+        # Reinitialize for new uploads.
+        self.temp_buffer = io.BytesIO()
+
+    async def _upload_chunk(self, final=False):
+        self.temp_buffer.write(self.buffer.getbuffer())
+
+    async def get_data(self):
+        return self.temp_buffer.getbuffer().tobytes()
+
+
+@pytest.mark.asyncio
+async def test_async_streamed_file_write():
+    test_fs = DummyAsyncFS()
+    streamed_file = await test_fs.open_async("misc/foo.txt", mode="wb")
+    inp_data = "foo-bar".encode("utf8") * streamed_file.blocksize * 2
+    await streamed_file.write(inp_data)
+    assert streamed_file.loc == len(inp_data)
+    await streamed_file.close()
+    out_data = await streamed_file.get_data()
+    assert out_data.count(b"foo-bar") == streamed_file.blocksize * 2
+
+
+@pytest.mark.asyncio
+async def test_async_streamed_file_read():
+    test_fs = DummyAsyncFS()
+    streamed_file = await test_fs.open_async("misc/foo.txt", mode="rb")
+    assert (
+        await streamed_file.read(7 * 3) + await streamed_file.read(7 * 18)
+        == b"foo-bar" * 20
+    )
+    await streamed_file.close()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bde713b8e24777378ff913879e0e0ee505e502e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_caches.py
@@ -0,0 +1,255 @@
+import pickle
+import string
+
+import pytest
+
+from fsspec.caching import (
+    BlockCache,
+    FirstChunkCache,
+    ReadAheadCache,
+    caches,
+    register_cache,
+)
+from fsspec.implementations.cached import WholeFileCacheFileSystem
+
+
+def test_cache_getitem(Cache_imp):
+    cacher = Cache_imp(4, letters_fetcher, len(string.ascii_letters))
+    assert cacher._fetch(0, 4) == b"abcd"
+    assert cacher._fetch(None, 4) == b"abcd"
+    assert cacher._fetch(2, 4) == b"cd"
+
+
+def test_block_cache_lru():
+    # BlockCache is a cache that stores blocks of data and uses LRU to evict
+    block_size = 4
+    cache = BlockCache(
+        block_size, letters_fetcher, len(string.ascii_letters), maxblocks=2
+    )
+    # miss
+    cache._fetch(0, 2)
+    assert cache.cache_info().misses == 1
+    assert cache.cache_info().currsize == 1
+    assert cache.total_requested_bytes == block_size * cache.miss_count
+    assert cache.size == 52
+
+    # hit
+    cache._fetch(0, 2)
+    assert cache.cache_info().misses == 1
+    assert cache.cache_info().currsize == 1
+    assert cache.total_requested_bytes == block_size * cache.miss_count
+
+    # hit
+    cache._fetch(0, 2)
+    assert cache.cache_info().misses == 1
+    assert cache.cache_info().currsize == 1
+    # this works as a counter since all the reads are from the cache
+    assert cache.hit_count == 3
+    assert cache.miss_count == 1
+    # so far only 4 bytes have been read using range requests
+    assert cache.total_requested_bytes == block_size * cache.miss_count
+
+    # miss
+    cache._fetch(4, 6)
+    assert cache.cache_info().misses == 2
+    assert cache.cache_info().currsize == 2
+    assert cache.total_requested_bytes == block_size * cache.miss_count
+
+    # miss & evict
+    cache._fetch(12, 13)
+    assert cache.cache_info().misses == 3
+    assert cache.cache_info().currsize == 2
+    assert cache.hit_count == 5
+    assert cache.miss_count == 3
+    assert cache.total_requested_bytes == block_size * cache.miss_count
+
+
+def test_first_cache():
+    """
+    FirstChunkCache is a cache that only caches the first chunk of data
+    when some of that first block is requested.
+    """
+    block_size = 5
+    cache = FirstChunkCache(block_size, letters_fetcher, len(string.ascii_letters))
+    assert cache.cache is None
+    assert cache._fetch(12, 15) == letters_fetcher(12, 15)
+    assert cache.miss_count == 1
+    assert cache.hit_count == 0
+    assert cache.cache is None
+    total_requested_bytes = 15 - 12
+    assert cache.total_requested_bytes == total_requested_bytes
+
+    # because we overlap with the cache range, it will be cached
+    assert cache._fetch(3, 10) == letters_fetcher(3, 10)
+    assert cache.miss_count == 2
+    assert cache.hit_count == 0
+    # we'll read the first 5 and then the rest
+    total_requested_bytes += block_size + 5
+    assert cache.total_requested_bytes == total_requested_bytes
+
+    # partial hit again
+    assert cache._fetch(3, 10) == letters_fetcher(3, 10)
+    assert cache.miss_count == 2
+    assert cache.hit_count == 1
+    # we have the first 5 bytes cached
+    total_requested_bytes += 10 - 5
+    assert cache.total_requested_bytes == total_requested_bytes
+
+    assert cache.cache == letters_fetcher(0, 5)
+    assert cache._fetch(0, 4) == letters_fetcher(0, 4)
+    assert cache.hit_count == 2
+    assert cache.miss_count == 2
+    assert cache.total_requested_bytes == 18
+
+
+def test_readahead_cache():
+    """
+    ReadAheadCache is a cache that reads ahead of the requested range.
+    If the access pattern is not sequential it will be very inefficient.
+    """
+    block_size = 5
+    cache = ReadAheadCache(block_size, letters_fetcher, len(string.ascii_letters))
+    assert cache._fetch(12, 15) == letters_fetcher(12, 15)
+    assert cache.miss_count == 1
+    assert cache.hit_count == 0
+    total_requested_bytes = 15 - 12 + block_size
+    assert cache.total_requested_bytes == total_requested_bytes
+
+    assert cache._fetch(3, 10) == letters_fetcher(3, 10)
+    assert cache.miss_count == 2
+    assert cache.hit_count == 0
+    assert len(cache.cache) == 12
+    total_requested_bytes += (10 - 3) + block_size
+    assert cache.total_requested_bytes == total_requested_bytes
+
+    # cache hit again
+    assert cache._fetch(3, 10) == letters_fetcher(3, 10)
+    assert cache.miss_count == 2
+    assert cache.hit_count == 1
+    assert len(cache.cache) == 12
+    assert cache.total_requested_bytes == total_requested_bytes
+    assert cache.cache == letters_fetcher(3, 15)
+
+    # cache miss
+    assert cache._fetch(0, 4) == letters_fetcher(0, 4)
+    assert cache.hit_count == 1
+    assert cache.miss_count == 3
+    assert len(cache.cache) == 9
+    total_requested_bytes += (4 - 0) + block_size
+    assert cache.total_requested_bytes == total_requested_bytes
+
+
+def _fetcher(start, end):
+    return b"0" * (end - start)
+
+
+def letters_fetcher(start, end):
+    return string.ascii_letters[start:end].encode()
+
+
+not_parts_caches = {k: v for k, v in caches.items() if k != "parts"}
+
+
+@pytest.fixture(params=not_parts_caches.values(), ids=list(not_parts_caches))
+def Cache_imp(request):
+    return request.param
+
+
+def test_cache_empty_file(Cache_imp):
+    blocksize = 5
+    size = 0
+    cache = Cache_imp(blocksize, _fetcher, size)
+    assert cache._fetch(0, 0) == b""
+
+
+def test_cache_pickleable(Cache_imp):
+    blocksize = 5
+    size = 100
+    cache = Cache_imp(blocksize, _fetcher, size)
+    cache._fetch(0, 5)  # fill in cache
+    unpickled = pickle.loads(pickle.dumps(cache))
+    assert isinstance(unpickled, Cache_imp)
+    assert unpickled.blocksize == blocksize
+    assert unpickled.size == size
+    assert unpickled._fetch(0, 10) == b"0" * 10
+
+
+@pytest.mark.parametrize(
+    "size_requests",
+    [[(0, 30), (0, 35), (51, 52)], [(0, 1), (1, 11), (1, 52)], [(0, 52), (11, 15)]],
+)
+@pytest.mark.parametrize("blocksize", [1, 10, 52, 100])
+def test_cache_basic(Cache_imp, blocksize, size_requests):
+    cache = Cache_imp(blocksize, letters_fetcher, len(string.ascii_letters))
+
+    for start, end in size_requests:
+        result = cache._fetch(start, end)
+        expected = string.ascii_letters[start:end].encode()
+        assert result == expected
+
+
+@pytest.mark.parametrize("strict", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+def test_known(sort, strict):
+    parts = {(10, 20): b"1" * 10, (20, 30): b"2" * 10, (0, 10): b"0" * 10}
+    if sort:
+        parts = dict(sorted(parts.items()))
+    c = caches["parts"](None, None, 100, parts, strict=strict)
+    assert (0, 30) in c.data  # got consolidated
+    assert c._fetch(5, 15) == b"0" * 5 + b"1" * 5
+    assert c._fetch(15, 25) == b"1" * 5 + b"2" * 5
+    if strict:
+        # Over-read will raise error
+        with pytest.raises(ValueError):
+            # tries to call None fetcher
+            c._fetch(25, 35)
+    else:
+        # Over-read will be zero-padded
+        assert c._fetch(25, 35) == b"2" * 5 + b"\x00" * 5
+
+
+def test_background(server, monkeypatch):
+    import threading
+    import time
+
+    import fsspec
+
+    head = {"head_ok": "true", "head_give_length": "true"}
+    urla = server + "/index/realfile"
+    h = fsspec.filesystem("http", headers=head)
+    thread_ids = {threading.current_thread().ident}
+    f = h.open(urla, block_size=5, cache_type="background")
+    orig = f.cache._fetch_block
+
+    def wrapped(*a, **kw):
+        thread_ids.add(threading.current_thread().ident)
+        return orig(*a, **kw)
+
+    f.cache._fetch_block = wrapped
+    assert len(thread_ids) == 1
+    f.read(1)
+    time.sleep(0.1)  # second block is loading
+    assert len(thread_ids) == 2
+
+
+def test_register_cache():
+    # just test that we have them populated and fail to re-add again unless overload
+    with pytest.raises(ValueError):
+        register_cache(BlockCache)
+    register_cache(BlockCache, clobber=True)
+
+
+def test_cache_kwargs(mocker):
+    # test that kwargs are passed to the underlying filesystem after cache commit
+
+    fs = WholeFileCacheFileSystem(target_protocol="memory")
+    fs.touch("test")
+    fs.fs.put = mocker.MagicMock()
+
+    with fs.open("test", "wb", overwrite=True) as file_handle:
+        file_handle.write(b"foo")
+
+    # We don't care about the first parameter, just retrieve its expected value.
+    # It is a random location that cannot be predicted.
+    # The important thing is the 'overwrite' kwarg
+    fs.fs.put.assert_called_with(fs.fs.put.call_args[0][0], "/test", overwrite=True)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cc679d0299ed4ecf4407be3e0a0f85cfa766feb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_callbacks.py
@@ -0,0 +1,89 @@
+import pytest
+
+from fsspec.callbacks import Callback, TqdmCallback
+
+
+def test_callbacks():
+    empty_callback = Callback()
+    assert empty_callback.call("something", somearg=None) is None
+
+    hooks = {"something": lambda *_, arg=None: arg + 2}
+    simple_callback = Callback(hooks=hooks)
+    assert simple_callback.call("something", arg=2) == 4
+
+    hooks = {"something": lambda *_, arg1=None, arg2=None: arg1 + arg2}
+    multi_arg_callback = Callback(hooks=hooks)
+    assert multi_arg_callback.call("something", arg1=2, arg2=2) == 4
+
+
+def test_callbacks_as_callback():
+    empty_callback = Callback.as_callback(None)
+    assert empty_callback.call("something", arg="somearg") is None
+    assert Callback.as_callback(None) is Callback.as_callback(None)
+
+    hooks = {"something": lambda *_, arg=None: arg + 2}
+    real_callback = Callback.as_callback(Callback(hooks=hooks))
+    assert real_callback.call("something", arg=2) == 4
+
+
+def test_callbacks_as_context_manager(mocker):
+    spy_close = mocker.spy(Callback, "close")
+
+    with Callback() as cb:
+        assert isinstance(cb, Callback)
+
+    spy_close.assert_called_once()
+
+
+def test_callbacks_branched():
+    callback = Callback()
+
+    branch = callback.branched("path_1", "path_2")
+
+    assert branch is not callback
+    assert isinstance(branch, Callback)
+
+
+@pytest.mark.asyncio
+async def test_callbacks_branch_coro(mocker):
+    async_fn = mocker.AsyncMock(return_value=10)
+    callback = Callback()
+    wrapped_fn = callback.branch_coro(async_fn)
+    spy = mocker.spy(callback, "branched")
+
+    assert await wrapped_fn("path_1", "path_2", key="value") == 10
+
+    spy.assert_called_once_with("path_1", "path_2", key="value")
+    async_fn.assert_called_once_with(
+        "path_1", "path_2", callback=spy.spy_return, key="value"
+    )
+
+
+def test_callbacks_wrap():
+    events = []
+
+    class TestCallback(Callback):
+        def relative_update(self, inc=1):
+            events.append(inc)
+
+    callback = TestCallback()
+    for _ in callback.wrap(range(10)):
+        ...
+
+    assert events == [1] * 10
+
+
+@pytest.mark.parametrize("tqdm_kwargs", [{}, {"desc": "A custom desc"}])
+def test_tqdm_callback(tqdm_kwargs, mocker):
+    pytest.importorskip("tqdm")
+    callback = TqdmCallback(tqdm_kwargs=tqdm_kwargs)
+    mocker.patch.object(callback, "_tqdm_cls")
+    callback.set_size(10)
+    for _ in callback.wrap(range(10)):
+        ...
+
+    assert callback.tqdm.update.call_count == 11
+    if not tqdm_kwargs:
+        callback._tqdm_cls.assert_called_with(total=10)
+    else:
+        callback._tqdm_cls.assert_called_with(total=10, **tqdm_kwargs)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b670ce128a98201bc7aa36190fe0e52a1a4d486
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_compression.py
@@ -0,0 +1,164 @@
+import pathlib
+
+import pytest
+
+import fsspec.core
+from fsspec.compression import compr, register_compression
+from fsspec.utils import compressions, infer_compression
+
+
+def test_infer_custom_compression():
+    """Inferred compression gets values from fsspec.compression.compr."""
+    assert infer_compression("fn.zip") == "zip"
+    assert infer_compression("fn.gz") == "gzip"
+    assert infer_compression("fn.unknown") is None
+    assert infer_compression("fn.test_custom") is None
+    assert infer_compression("fn.tst") is None
+
+    register_compression("test_custom", lambda f, **kwargs: f, "tst")
+
+    try:
+        assert infer_compression("fn.zip") == "zip"
+        assert infer_compression("fn.gz") == "gzip"
+        assert infer_compression("fn.unknown") is None
+        assert infer_compression("fn.test_custom") is None
+        assert infer_compression("fn.tst") == "test_custom"
+
+        # Duplicate registration in name or extension raises a value error.
+        with pytest.raises(ValueError):
+            register_compression("test_custom", lambda f, **kwargs: f, "tst")
+
+        with pytest.raises(ValueError):
+            register_compression("test_conflicting", lambda f, **kwargs: f, "tst")
+        assert "test_conflicting" not in compr
+
+        # ...but can be forced.
+        register_compression(
+            "test_conflicting", lambda f, **kwargs: f, "tst", force=True
+        )
+        assert infer_compression("fn.zip") == "zip"
+        assert infer_compression("fn.gz") == "gzip"
+        assert infer_compression("fn.unknown") is None
+        assert infer_compression("fn.test_custom") is None
+        assert infer_compression("fn.tst") == "test_conflicting"
+
+    finally:
+        compr.pop("test_custom", None)  # missing keys must not mask a failure above
+        compr.pop("test_conflicting", None)
+        compressions.pop("tst", None)
+
+
+def test_infer_uppercase_compression():
+    assert infer_compression("fn.ZIP") == "zip"
+    assert infer_compression("fn.GZ") == "gzip"
+    assert infer_compression("fn.UNKNOWN") is None
+    assert infer_compression("fn.TEST_UPPERCASE") is None
+    assert infer_compression("fn.TEST") is None
+
+
+def test_lzma_compression_name():
+    pytest.importorskip("lzma")
+    assert infer_compression("fn.xz") == "xz"
+    assert infer_compression("fn.lzma") == "lzma"
+
+
+def test_lz4_compression(tmpdir):
+    """Infer lz4 compression for .lz4 files if lz4 is available."""
+    tmp_path = pathlib.Path(str(tmpdir))
+
+    lz4 = pytest.importorskip("lz4")
+
+    tmp_path.mkdir(exist_ok=True)
+
+    tdat = "foobar" * 100
+
+    with fsspec.core.open(
+        str(tmp_path / "out.lz4"), mode="wt", compression="infer"
+    ) as outfile:
+        outfile.write(tdat)
+
+    compressed = (tmp_path / "out.lz4").read_bytes()  # opens and closes the file
+    assert lz4.frame.decompress(compressed).decode() == tdat
+
+    with fsspec.core.open(
+        str(tmp_path / "out.lz4"), mode="rt", compression="infer"
+    ) as infile:
+        assert infile.read() == tdat
+
+    with fsspec.core.open(
+        str(tmp_path / "out.lz4"), mode="rt", compression="lz4"
+    ) as infile:
+        assert infile.read() == tdat
+
+
+def test_zstd_compression(tmpdir):
+    """Infer zstd compression for .zst files if zstandard is available."""
+    tmp_path = pathlib.Path(str(tmpdir))
+
+    zstd = pytest.importorskip("zstandard")
+
+    tmp_path.mkdir(exist_ok=True)
+
+    tdat = "foobar" * 100
+
+    with fsspec.core.open(
+        str(tmp_path / "out.zst"), mode="wt", compression="infer"
+    ) as outfile:
+        outfile.write(tdat)
+
+    compressed = (tmp_path / "out.zst").read_bytes()  # opens and closes the file
+    assert zstd.ZstdDecompressor().decompress(compressed, len(tdat)).decode() == tdat
+
+    with fsspec.core.open(
+        str(tmp_path / "out.zst"), mode="rt", compression="infer"
+    ) as infile:
+        assert infile.read() == tdat
+
+    with fsspec.core.open(
+        str(tmp_path / "out.zst"), mode="rt", compression="zstd"
+    ) as infile:
+        assert infile.read() == tdat
+
+    # fails in https://github.com/fsspec/filesystem_spec/issues/725
+    infile = fsspec.core.open(
+        str(tmp_path / "out.zst"), mode="rb", compression="infer"
+    ).open()
+
+    infile.close()
+
+
+def test_snappy_compression(tmpdir):
+    """No registered compression for snappy, but can be specified."""
+    tmp_path = pathlib.Path(str(tmpdir))
+
+    snappy = pytest.importorskip("snappy")
+
+    tmp_path.mkdir(exist_ok=True)
+
+    tdat = "foobar" * 100
+
+    # Snappy isn't inferred.
+    with fsspec.core.open(
+        str(tmp_path / "out.snappy"), mode="wt", compression="infer"
+    ) as outfile:
+        outfile.write(tdat)
+    assert (tmp_path / "out.snappy").read_bytes().decode() == tdat  # stored as-is
+
+    # but can be specified.
+    with fsspec.core.open(
+        str(tmp_path / "out.snappy"), mode="wt", compression="snappy"
+    ) as outfile:
+        outfile.write(tdat)
+
+    compressed = (tmp_path / "out.snappy").read_bytes()  # opens and closes the file
+    assert snappy.StreamDecompressor().decompress(compressed).decode() == tdat
+
+    with fsspec.core.open(
+        str(tmp_path / "out.snappy"), mode="rb", compression="infer"
+    ) as infile:
+        assert infile.read() == compressed
+
+    with fsspec.core.open(
+        str(tmp_path / "out.snappy"), mode="rt", compression="snappy"
+    ) as infile:
+        assert infile.read() == tdat
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_config.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc251a0d91fbed1bdec6df0b185d1607cdb2b82f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_config.py
@@ -0,0 +1,129 @@
+import os
+from warnings import catch_warnings
+
+import pytest
+
+import fsspec
+from fsspec.config import conf, set_conf_env, set_conf_files
+
+
+@pytest.fixture
+def clean_conf():
+    """Tests should start and end with clean config dict"""
+    conf.clear()
+    yield
+    conf.clear()
+
+
+def test_from_env_ignored(clean_conf):
+    env = {
+        "FSSPEC": "missing_protocol",
+        "FSSPEC_": "missing_protocol",
+        "FSSPEC__INVALID_KEY": "invalid_protocol",
+        "FSSPEC_INVALID1": "not_json_dict",
+        "FSSPEC_INVALID2": '["not_json_dict"]',
+    }
+    cd = {}
+    with catch_warnings(record=True) as w:
+        set_conf_env(conf_dict=cd, envdict=env)
+        assert len(w) == 5
+        assert "unexpected name" in str(w[0].message)
+        assert "unexpected name" in str(w[1].message)
+        assert "unexpected name" in str(w[2].message)
+        assert "parse failure" in str(w[3].message)
+        assert "not being a dict" in str(w[4].message)
+    assert cd == {}
+
+
+def test_from_env_kwargs(clean_conf):
+    env = {
+        "FSSPEC_PROTO_KEY": "value",
+        "FSSPEC_PROTO_LONG_KEY": "othervalue",
+        "FSSPEC_MALFORMED": "novalue",
+    }
+    cd = {}
+    with catch_warnings(record=True) as w:
+        set_conf_env(conf_dict=cd, envdict=env)
+        assert len(w) == 1
+        assert "parse failure" in str(w[0].message)
+    assert cd == {"proto": {"key": "value", "long_key": "othervalue"}}
+
+
+def test_from_env_protocol_dict(clean_conf):
+    env = {
+        "FSSPEC_PROTO": '{"int": 1, "float": 2.3, "bool": true, "dict": {"key": "val"}}'
+    }
+    cd = {}
+    set_conf_env(conf_dict=cd, envdict=env)
+    assert cd == {
+        "proto": {"int": 1, "float": 2.3, "bool": True, "dict": {"key": "val"}}
+    }
+
+
+def test_from_env_kwargs_override_protocol_dict(clean_conf):
+    env = {
+        "FSSPEC_PROTO_LONG_KEY": "override1",
+        "FSSPEC_PROTO": '{"key": "value1", "long_key": "value2", "otherkey": "value3"}',
+        "FSSPEC_PROTO_KEY": "override2",
+    }
+    cd = {}
+    set_conf_env(conf_dict=cd, envdict=env)
+    assert cd == {
+        "proto": {"key": "override2", "long_key": "override1", "otherkey": "value3"}
+    }
+
+
+def test_from_file_ini(clean_conf, tmpdir):
+    file1 = os.path.join(tmpdir, "1.ini")
+    file2 = os.path.join(tmpdir, "2.ini")
+    with open(file1, "w") as f:
+        f.write(
+            """[proto]
+key=value
+other_key:othervalue
+overwritten=dont_see
+        """
+        )
+    with open(file2, "w") as f:
+        f.write(
+            """[proto]
+overwritten=see
+        """
+        )
+    cd = {}
+    set_conf_files(tmpdir, cd)
+    assert cd == {
+        "proto": {"key": "value", "other_key": "othervalue", "overwritten": "see"}
+    }
+
+
+def test_from_file_json(clean_conf, tmpdir):
+    file1 = os.path.join(tmpdir, "1.json")
+    file2 = os.path.join(tmpdir, "2.json")
+    with open(file1, "w") as f:
+        f.write(
+            """{"proto":
+{"key": "value",
+"other_key": "othervalue",
+"overwritten": false}}
+        """
+        )
+    with open(file2, "w") as f:
+        f.write(
+            """{"proto":
+{"overwritten": true}}
+        """
+        )
+    cd = {}
+    set_conf_files(tmpdir, cd)
+    assert cd == {
+        "proto": {"key": "value", "other_key": "othervalue", "overwritten": True}
+    }
+
+
+def test_apply(clean_conf):
+    conf["file"] = {"auto_mkdir": "test"}
+    fs = fsspec.filesystem("file")
+    assert fs.auto_mkdir == "test"
+    fs = fsspec.filesystem("file", auto_mkdir=True)
+    assert fs.auto_mkdir is True
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_core.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..8626ae8fd9bba7a58cad8845f656977482444df7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_core.py
@@ -0,0 +1,466 @@
+import os
+import pickle
+import tempfile
+import zipfile
+from contextlib import contextmanager
+from pathlib import Path
+
+import pytest
+
+import fsspec
+from fsspec.core import (
+    OpenFile,
+    OpenFiles,
+    _expand_paths,
+    expand_paths_if_needed,
+    get_compression,
+    get_fs_token_paths,
+    open_files,
+    open_local,
+)
+
+
+@contextmanager
+def tempzip(data={}):  # yields the path of a temp zip holding ``data`` (name -> bytes)
+    fd, f = tempfile.mkstemp(suffix="zip")  # own the descriptor so it gets closed
+    with os.fdopen(fd, "wb") as raw, zipfile.ZipFile(raw, mode="w") as z:
+        for k, v in data.items():
+            z.writestr(k, v)
+    try:
+        yield f
+    finally:
+        try:
+            os.remove(f)
+        except OSError:  # best-effort cleanup; the file may already be gone
+            pass
+
+
+@pytest.mark.parametrize(
+    "path, name_function, num, out",
+    [
+        [["apath"], None, 1, ["apath"]],
+        ["apath.*.csv", None, 1, ["apath.0.csv"]],
+        ["apath.*.csv", None, 2, ["apath.0.csv", "apath.1.csv"]],
+        ["a*", lambda x: "abc"[x], 2, ["aa", "ab"]],
+    ],
+)
+def test_expand_paths(path, name_function, num, out):
+    assert _expand_paths(path, name_function, num) == out
+
+
+@pytest.mark.parametrize(
+    "create_files, path, out",
+    [
+        [["apath"], "apath", ["apath"]],
+        [["apath1"], "apath*", ["apath1"]],
+        [["apath1", "apath2"], "apath*", ["apath1", "apath2"]],
+        [["apath1", "apath2"], "apath[1]", ["apath1"]],
+        [["apath1", "apath11"], "apath?", ["apath1"]],
+    ],
+)
+def test_expand_paths_if_needed_in_read_mode(create_files, path, out):
+    d = str(tempfile.mkdtemp())
+    for f in create_files:
+        f = os.path.join(d, f)
+        open(f, "w").write("test")
+
+    path = os.path.join(d, path)
+
+    fs = fsspec.filesystem("file")
+    res = expand_paths_if_needed([path], "r", 0, fs, None)
+    assert [os.path.basename(p) for p in res] == out
+
+
+def test_expand_error():
+    with pytest.raises(ValueError):
+        _expand_paths("*.*", None, 1)
+
+
+@pytest.mark.parametrize("mode", ["w", "w+", "x", "x+"])
+def test_expand_fs_token_paths(mode):
+    assert len(get_fs_token_paths("path", mode, num=2, expand=True)[-1]) == 2
+
+
+def test_openfile_api(m):
+    m.open("somepath", "wb").write(b"data")
+    of = OpenFile(m, "somepath")
+    assert str(of) == "<OpenFile 'somepath'>"
+    f = of.open()
+    assert f.read() == b"data"
+    f.close()
+    with OpenFile(m, "somepath", mode="rt") as f:
+        assert f.read() == "data"
+
+
+def test_openfile_open(m):
+    of = OpenFile(m, "somepath", mode="wt")
+    f = of.open()
+    f.write("hello")
+    assert m.size("somepath") == 0  # no flush yet
+    of.close()
+    assert m.size("somepath") == 5
+
+
+def test_open_local_w_cache():
+    d1 = str(tempfile.mkdtemp())
+    f1 = os.path.join(d1, "f1")
+    open(f1, "w").write("test1")
+    d2 = str(tempfile.mkdtemp())
+    fn = open_local(f"simplecache://{f1}", cache_storage=d2, target_protocol="file")
+    assert isinstance(fn, str)
+    assert open(fn).read() == "test1"
+    assert d2 in fn
+
+
+def test_open_local_w_magic():
+    d1 = str(tempfile.mkdtemp())
+    f1 = os.path.join(d1, "f1")
+    open(f1, "w").write("test1")
+    fn = open_local(os.path.join(d1, "f*"))
+    assert len(fn) == 1
+    assert isinstance(fn, list)
+
+
+def test_open_local_w_list_of_str():
+    d1 = str(tempfile.mkdtemp())
+    f1 = os.path.join(d1, "f1")
+    open(f1, "w").write("test1")
+    fn = open_local([f1, f1])
+    assert len(fn) == 2
+    assert isinstance(fn, list)
+    assert all(isinstance(elem, str) for elem in fn)
+
+
+def test_open_local_w_path():
+    d1 = str(tempfile.mkdtemp())
+    f1 = os.path.join(d1, "f1")
+    open(f1, "w").write("test1")
+    p = Path(f1)
+    fn = open_local(p)
+    assert isinstance(fn, str)
+
+
+def test_open_local_w_list_of_path():
+    d1 = str(tempfile.mkdtemp())
+    f1 = os.path.join(d1, "f1")
+    open(f1, "w").write("test1")
+    p = Path(f1)
+    fn = open_local([p, p])
+    assert len(fn) == 2
+    assert isinstance(fn, list)
+    assert all(isinstance(elem, str) for elem in fn)
+
+
+def test_xz_lzma_compressions():
+    pytest.importorskip("lzma")
+    # Ensure that both 'xz' and 'lzma' compression names can be parsed
+    assert get_compression("some_file.xz", "infer") == "xz"
+    assert get_compression("some_file.xz", "xz") == "xz"
+    assert get_compression("some_file.xz", "lzma") == "lzma"
+
+
+def test_list():
+    here = os.path.abspath(os.path.dirname(__file__))
+    flist = os.listdir(here)
+    plist = [os.path.join(here, p).replace("\\", "/") for p in flist]
+    of = open_files(plist)
+    assert len(of) == len(flist)
+    assert [f.path for f in of] == plist
+
+
+def test_open_expand(m, monkeypatch):
+    m.pipe("/myfile", b"hello")
+    with pytest.raises(FileNotFoundError, match="expand=True"):
+        with fsspec.open("memory://my*", expand=False):
+            pass
+    with fsspec.open("memory://my*", expand=True) as f:
+        assert f.path == "/myfile"
+    monkeypatch.setattr(fsspec.core, "DEFAULT_EXPAND", True)
+    with fsspec.open("memory://my*") as f:
+        assert f.path == "/myfile"
+
+
+def test_pathobject(tmpdir):
+    import pathlib
+
+    tmpdir = str(tmpdir)
+    plist_str = [os.path.join(str(tmpdir), f).replace("\\", "/") for f in ["a", "b"]]
+    open(plist_str[0], "w").write("first file")
+    open(plist_str[1], "w").write("second file")
+    plist = [pathlib.Path(p) for p in plist_str]
+    of = open_files(plist)
+    assert len(of) == 2
+    assert [f.path for f in of] == plist_str
+
+    of = open_files(plist[0])
+    assert len(of) == 1
+    assert of[0].path == plist_str[0]
+    with of[0] as f:
+        assert f.read() == open(plist_str[0], "rb").read()
+
+
+def test_automkdir(tmpdir):
+    dir = os.path.join(str(tmpdir), "a")
+    of = fsspec.open(os.path.join(dir, "afile"), "w", auto_mkdir=False)
+    with pytest.raises(IOError):
+        with of:
+            pass
+
+    dir = os.path.join(str(tmpdir), "b")
+    of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=True)
+    with of:
+        pass
+
+    assert "bfile" in os.listdir(dir)
+
+    dir = os.path.join(str(tmpdir), "c")
+    with pytest.raises(FileNotFoundError):
+        of = fsspec.open(os.path.join(dir, "bfile"), "w", auto_mkdir=False)
+        with of:
+            pass
+
+
+def test_automkdir_readonly(tmpdir):
+    dir = os.path.join(str(tmpdir), "d")
+    with pytest.raises(FileNotFoundError):
+        of = fsspec.open(os.path.join(dir, "dfile"), "r")
+        with of:
+            pass
+
+
+def test_openfile_pickle_newline():
+    # GH#318
+    test = fsspec.open(__file__, newline=b"")
+
+    pickled = pickle.dumps(test)
+    restored = pickle.loads(pickled)
+
+    assert test.newline == restored.newline
+
+
+def test_pickle_after_open_open():
+    of = fsspec.open(__file__, mode="rt")
+    test = of.open()
+    of2 = pickle.loads(pickle.dumps(of))
+    test2 = of2.open()
+    test.close()
+
+    assert not test2.closed
+    of.close()
+    of2.close()
+
+
+# Define a list of special glob characters.
+# Note that we need to escape some characters and also consider file system limitations.
+# '*' and '?' are only included on non-Windows platforms, because Windows does not
+# allow them in file names. Similarly, we're careful with '{', '}', and '@' as their
+# special meaning is context-specific and might not be considered special for filenames.
+# Add tests for more file systems and for more glob magic later
+glob_magic_characters = ["[", "]", "!"]
+if os.name != "nt":
+    glob_magic_characters.extend(("*", "?"))  # not valid on Windows
+
+
+@pytest.mark.parametrize("char", glob_magic_characters)
+def test_open_file_read_with_special_characters(tmp_path, char):
+    # Create a filename incorporating the special character
+    file_name = f"test{char}.txt"
+    file_path = tmp_path / file_name
+    expected_content = "Hello, world!"
+
+    with open(file_path, "w") as f:
+        f.write(expected_content)
+
+    with fsspec.open(file_path, "r") as f:
+        actual_content = f.read()
+
+    assert actual_content == expected_content
+
+
+@pytest.mark.parametrize("char", glob_magic_characters)
+def test_open_files_read_with_special_characters(tmp_path, char):
+    # Create a filename incorporating the special character
+    file_name = f"test{char}.txt"
+    file_path = tmp_path / file_name
+    expected_content = "Hello, world!"
+
+    with open(file_path, "w") as f:
+        f.write(expected_content)
+
+    with fsspec.open_files(file_path, "r")[0] as f:
+        actual_content = f.read()
+
+    assert actual_content == expected_content
+
+
+@pytest.mark.parametrize("char", glob_magic_characters)
+def test_open_file_write_with_special_characters(tmp_path, char, monkeypatch):
+    # Create a filename incorporating the special character
+    file_name = f"test{char}.txt"
+    file_path = tmp_path / file_name
+    expected_content = "Hello, world!"
+
+    with fsspec.open(file_path, "w", expand=False) as f:  # explicit expand=False
+        f.write(expected_content)
+
+    with open(file_path, "r") as f:
+        actual_content = f.read()
+
+    monkeypatch.setattr(fsspec.core, "DEFAULT_EXPAND", False)  # same, via the default
+    with fsspec.open(file_path, "w") as f:
+        f.write(expected_content * 2)
+
+    with open(file_path, "r") as f:
+        assert f.read() == expected_content * 2  # verify the second write landed
+
+    assert actual_content == expected_content
+
+
+@pytest.mark.parametrize("char", glob_magic_characters)
+def test_open_files_read_with_special_characters_no_expand(tmp_path, char):
+    # Literal (expand=False) read of a filename containing the special character
+    file_name = f"test{char}.txt"
+    file_path = tmp_path / file_name
+    expected_content = "Hello, world!"
+
+    with open(file_path, "w") as f:
+        f.write(expected_content)
+
+    with fsspec.open_files(
+        urlpath=[os.fspath(file_path)], mode="r", auto_mkdir=False, expand=False
+    )[0] as f:
+        actual_content = f.read()
+
+    assert actual_content == expected_content
+
+
+@pytest.mark.parametrize("char", glob_magic_characters)
+def test_open_files_write_with_special_characters(tmp_path, char):
+    # Create a filename incorporating the special character
+    file_name = f"test{char}.txt"
+    file_path = tmp_path / file_name
+    expected_content = "Hello, world!"
+
+    with fsspec.open_files(
+        urlpath=[os.fspath(file_path)], mode="w", auto_mkdir=False, expand=False
+    )[0] as f:
+        f.write(expected_content)
+
+    with open(file_path, "r") as f:
+        actual_content = f.read()
+
+    assert actual_content == expected_content
+
+
+def test_mismatch():
+    pytest.importorskip("s3fs")
+    with pytest.raises(ValueError):
+        open_files(["s3://test/path.csv", "/other/path.csv"])
+
+
+def test_url_kwargs_chain(ftp_writable):
+    host, port, username, password = ftp_writable
+    data = b"hello"
+    with fsspec.open(
+        "ftp:///afile", "wb", host=host, port=port, username=username, password=password
+    ) as f:
+        f.write(data)
+
+    with fsspec.open(
+        f"simplecache::ftp://{username}:{password}@{host}:{port}//afile", "rb"
+    ) as f:
+        assert f.read() == data
+
+
+def test_multi_context(tmpdir):
+    fns = [os.path.join(tmpdir, fn) for fn in ["a", "b"]]
+    files = open_files(fns, "wb")
+    assert isinstance(files, OpenFiles)
+    assert isinstance(files[0], OpenFile)
+    assert len(files) == 2
+    assert isinstance(files[:1], OpenFiles)
+    assert len(files[:1]) == 1
+    with files as of:
+        assert len(of) == 2
+        assert not of[0].closed
+        assert of[0].name.endswith("a")
+    assert of[0].closed
+    assert repr(files) == "<List of 2 OpenFile instances>"
+
+
+def test_not_local():
+    with pytest.raises(ValueError, match="attribute local_file=True"):
+        open_local("memory://afile")
+
+
+def test_url_to_fs(ftp_writable):
+    host, port, username, password = ftp_writable
+    data = b"hello"
+    with fsspec.open(f"ftp://{username}:{password}@{host}:{port}/afile", "wb") as f:
+        f.write(data)
+    fs, url = fsspec.core.url_to_fs(
+        f"simplecache::ftp://{username}:{password}@{host}:{port}/afile"
+    )
+    assert url == "/afile"
+    fs, url = fsspec.core.url_to_fs(f"ftp://{username}:{password}@{host}:{port}/afile")
+    assert url == "/afile"
+
+    with fsspec.open(f"ftp://{username}:{password}@{host}:{port}/afile.zip", "wb") as f:
+        import zipfile
+
+        with zipfile.ZipFile(f, "w") as z:
+            with z.open("inner", "w") as f2:
+                f2.write(b"hello")
+        f.write(data)
+
+    fs, url = fsspec.core.url_to_fs(
+        f"zip://inner::ftp://{username}:{password}@{host}:{port}/afile.zip"
+    )
+    assert url == "inner"
+    fs, url = fsspec.core.url_to_fs(
+        f"simplecache::zip::ftp://{username}:{password}@{host}:{port}/afile.zip"
+    )
+    assert url == ""
+
+
+def test_target_protocol_options(ftp_writable):
+    host, port, username, password = ftp_writable
+    data = {"afile": b"hello"}
+    options = {"host": host, "port": port, "username": username, "password": password}
+    with tempzip(data) as lfile, fsspec.open(
+        "ftp:///archive.zip", "wb", **options
+    ) as f:
+        f.write(open(lfile, "rb").read())
+    with fsspec.open(
+        "zip://afile",
+        "rb",
+        target_protocol="ftp",
+        target_options=options,
+        fo="archive.zip",
+    ) as f:
+        assert f.read() == data["afile"]
+
+
+def test_chained_url(ftp_writable):
+    host, port, username, password = ftp_writable
+    data = {"afile": b"hello"}
+    cls = fsspec.get_filesystem_class("ftp")
+    fs = cls(host=host, port=port, username=username, password=password)
+    with tempzip(data) as lfile:
+        fs.put_file(lfile, "archive.zip")
+
+    urls = [
+        "zip://afile",
+        "zip://afile::simplecache",
+        "simplecache::zip://afile",
+        "simplecache::zip://afile::simplecache",
+    ]
+    for url in urls:
+        url += f"::ftp://{username}:{password}@{host}:{port}/archive.zip"
+        with fsspec.open(url, "rb") as f:
+            assert f.read() == data["afile"]
+
+
+def test_automkdir_local():
+    fs, _ = fsspec.core.url_to_fs("file://", auto_mkdir=True)
+    assert fs.auto_mkdir is True
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_downstream.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..172b2a7a73e8a4eef07776bfe25cc410c4495577
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_downstream.py
@@ -0,0 +1,40 @@
+import pytest
+
+pytest.importorskip("s3fs")
+pytest.importorskip("moto")
+
+try:
+    from s3fs.tests.test_s3fs import (  # noqa: E402,F401
+        endpoint_uri,
+        s3,
+        s3_base,
+        test_bucket_name,
+    )
+except ImportError:  # skipping outside a test requires allow_module_level=True
+    pytest.skip("s3 tests not available.", allow_module_level=True)
+
+so = {"anon": False, "client_kwargs": {"endpoint_url": endpoint_uri}}
+
+
+def test_pandas(s3):
+    pd = pytest.importorskip("pandas")
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    df.to_csv(f"s3://{test_bucket_name}/a.csv", storage_options=so)
+    df2 = pd.read_csv(f"s3://{test_bucket_name}/a.csv", storage_options=so)
+
+    assert df.a.equals(df2.a)
+
+
+def test_xarray_zarr(s3):
+    xr = pytest.importorskip("xarray")
+    pytest.importorskip("zarr")
+    import numpy as np
+
+    x = np.arange(5)
+    xarr = xr.DataArray(x)
+    ds = xr.Dataset({"x": xarr})
+    ds.to_zarr(f"s3://{test_bucket_name}/a.zarr", storage_options=so)
+
+    ds2 = xr.open_zarr(f"s3://{test_bucket_name}/a.zarr", storage_options=so)
+
+    assert (ds.x == ds2.x).all()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_file.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..784f50753f2af9fde4f16a7945a10bce8f76f1c5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_file.py
@@ -0,0 +1,200 @@
+"""Tests abstract buffered file API, using FTP implementation"""
+
+import pickle
+
+import pytest
+
+from fsspec.implementations.tests.test_ftp import FTPFileSystem
+
+data = b"hello" * 10000
+
+
+def test_pickle(ftp_writable):
+    host, port, user, pw = ftp_writable
+    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
+
+    f = ftp.open("/out", "rb")
+
+    f2 = pickle.loads(pickle.dumps(f))
+    assert f == f2
+
+
+def test_file_read_attributes(ftp_writable):
+    host, port, user, pw = ftp_writable
+    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
+
+    f = ftp.open("/out", "rb")
+    assert f.info()["size"] == len(data)
+    assert f.tell() == 0
+    assert f.seekable()
+    assert f.readable()
+    assert not f.writable()
+    out = bytearray(len(data))
+
+    assert f.read() == data
+    assert f.read() == b""  # at EOF a further read yields nothing
+    f.seek(0)
+    assert f.readuntil(b"l") == b"hel"
+    assert f.tell() == 3
+
+    f.readinto1(out)
+    assert out[:-3] == data[3:]  # only len(data) - 3 bytes were left to read
+    with pytest.raises(ValueError):
+        f.write(b"")  # file was opened read-only
+    f.close()
+    with pytest.raises(ValueError):
+        f.read()  # reading a closed file must fail
+
+
+def test_seek(ftp_writable):
+    host, port, user, pw = ftp_writable
+    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
+
+    f = ftp.open("/out", "rb")
+
+    assert f.seek(-10, 2) == len(data) - 10
+    assert f.tell() == len(data) - 10
+    assert f.seek(-1, 1) == len(data) - 11
+    with pytest.raises(ValueError):
+        f.seek(-1)
+    with pytest.raises(ValueError):
+        f.seek(0, 7)
+
+
+def test_file_idempotent(ftp_writable):
+    host, port, user, pw = ftp_writable
+    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
+
+    f = ftp.open("/out", "rb")
+    f2 = ftp.open("/out", "rb")
+    assert hash(f) == hash(f2)
+    assert f == f2
+    ftp.touch("/out2")
+    f2 = ftp.open("/out2", "rb")
+    assert hash(f2) != hash(f)
+    assert f != f2
+    f2 = ftp.open("/out", "wb")
+    assert hash(f2) != hash(f)
+
+
+def test_file_text_attributes(ftp_writable):
+    host, port, user, pw = ftp_writable
+    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
+
+    data = b"hello\n" * 1000
+    with ftp.open("/out2", "wb") as f:
+        f.write(data)
+
+    f = ftp.open("/out2", "rb")
+    assert f.readline() == b"hello\n"
+    f.seek(0)
+    assert list(f) == [d + b"\n" for d in data.split()]
+    f.seek(0)
+    assert f.readlines() == [d + b"\n" for d in data.split()]
+
+    f = ftp.open("/out2", "rt")
+    assert f.readline() == "hello\n"
+    assert f.encoding
+
+
+def test_file_write_attributes(ftp_writable):
+    host, port, user, pw = ftp_writable
+    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
+    f = ftp.open("/out2", "wb")
+    with pytest.raises(ValueError):
+        f.info()
+    with pytest.raises(OSError):
+        f.seek(0)
+    with pytest.raises(ValueError):
+        f.read(0)
+    assert not f.readable()
+    assert f.writable()
+
+    f.flush()  # no-op
+
+    assert f.write(b"hello") == 5
+    assert f.write(b"hello") == 5
+    assert not f.closed
+    f.close()
+    assert f.closed
+    with pytest.raises(ValueError):
+        f.write(b"")
+    with pytest.raises(ValueError):
+        f.flush()
+
+
+def test_midread_cache(ftp_writable):
+    host, port, user, pw = ftp_writable
+    fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
+    fn = "/myfile"
+    with fs.open(fn, "wb") as f:
+        f.write(b"a" * 175627146)
+    with fs.open(fn, "rb") as f:
+        f.seek(175561610)
+        d1 = f.read(65536)
+        assert len(d1) == 65536
+
+        f.seek(4)
+        size = 17562198
+        d2 = f.read(size)
+        assert len(d2) == size
+
+        f.seek(17562288)
+        size = 17562187
+        d3 = f.read(size)
+        assert len(d3) == size
+
+
+def test_read_block(ftp_writable):
+    # not the same as test_read_block in test_utils, this depends on the
+    # behaviour of the bytes caching
+    from fsspec.utils import read_block
+
+    host, port, user, pw = ftp_writable
+    fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
+    fn = "/myfile"
+    with fs.open(fn, "wb") as f:
+        f.write(b"a,b\n1,2")
+    f = fs.open(fn, "rb", cache_type="bytes")
+    assert read_block(f, 0, 6400, b"\n") == b"a,b\n1,2"
+
+
+def test_with_gzip(ftp_writable):
+    import gzip
+
+    data = b"some compressible stuff"
+    host, port, user, pw = ftp_writable
+    fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
+    fn = "/myfile"
+    with fs.open(fn, "wb") as f:
+        gf = gzip.GzipFile(fileobj=f, mode="w")
+        gf.write(data)
+        gf.close()
+    with fs.open(fn, "rb") as f:
+        gf = gzip.GzipFile(fileobj=f, mode="r")
+        assert gf.read() == data
+
+
+def test_auto_compression(m):
+    fs = m
+    with fs.open("myfile.gz", mode="wt", compression="infer") as f:
+        f.write("text")
+    with fs.open("myfile.gz", mode="rt", compression="infer") as f:
+        assert f.read() == "text"
+
+
+def test_with_zip(ftp_writable):
+    import zipfile
+
+    data = b"hello zip"
+    host, port, user, pw = ftp_writable
+    fs = FTPFileSystem(host=host, port=port, username=user, password=pw)
+    fn = "/myfile.zip"
+    inner_file = "test.txt"
+    with fs.open(fn, "wb") as f:
+        zf = zipfile.ZipFile(f, mode="w")
+        zf.writestr(inner_file, data)
+        zf.close()
+    with fs.open(fn, "rb") as f:
+        zf = zipfile.ZipFile(f, mode="r")
+        assert zf.read(inner_file) == data
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_fuse.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_fuse.py
new file mode 100644
index 0000000000000000000000000000000000000000..db627ffc962fa58f2e7de4e335a3a08cceeed625
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_fuse.py
@@ -0,0 +1,147 @@
+import os
+import subprocess
+import time
+from multiprocessing import Process
+
+import pytest
+
+try:
+    pytest.importorskip("fuse")  # noqa: E402
+except OSError:
+    # can succeed in importing fuse, but fail to load so
+    pytest.importorskip("nonexistent")  # noqa: E402
+
+from fsspec.fuse import main, run
+from fsspec.implementations.memory import MemoryFileSystem
+
+
+def host_fuse(mountdir):
+    fs = MemoryFileSystem()
+    fs.touch("/mounted/testfile")
+    run(fs, "/mounted/", mountdir)
+
+
+def test_basic(tmpdir, capfd):
+    """Mount a MemoryFileSystem through FUSE and exercise basic file/dir ops."""
+    mountdir = str(tmpdir.mkdir("mount"))
+
+    fuse_process = Process(target=host_fuse, args=(str(mountdir),))
+    fuse_process.start()
+
+    try:
+        timeout = 10
+        while True:
+            try:
+                # can fail with device not ready while waiting for fuse
+                if "testfile" in os.listdir(mountdir):
+                    break
+            except Exception:
+                pass
+            timeout -= 1
+            time.sleep(1)
+            if not timeout > 0:
+                # No debugger breakpoint here: it cannot be used in unattended
+                # runs. The reason is positional so any pytest version accepts it.
+                pytest.skip("fuse didn't come live")
+
+        fn = os.path.join(mountdir, "test")
+        with open(fn, "wb") as f:
+            f.write(b"data")
+
+        with open(fn) as f:
+            assert f.read() == "data"
+
+        os.remove(fn)
+
+        os.mkdir(fn)
+        assert os.listdir(fn) == []
+
+        os.mkdir(fn + "/inner")
+
+        with pytest.raises(OSError):  # fn still contains "inner"
+            os.rmdir(fn)
+
+        captured = capfd.readouterr()
+        assert "Traceback" not in captured.out
+        assert "Traceback" not in captured.err
+
+        os.rmdir(fn + "/inner")
+        os.rmdir(fn)
+    finally:
+        fuse_process.terminate()
+        fuse_process.join(timeout=10)
+        if fuse_process.is_alive():  # terminate() did not stop it within 10s
+            fuse_process.kill()
+            fuse_process.join()
+
+
+def host_mount_local(source_dir, mount_dir, debug_log):
+    main(["local", source_dir, mount_dir, "-l", debug_log, "--ready-file"])
+
+
+@pytest.fixture()
+def mount_local(tmpdir):
+    """Mount a local source dir via FUSE in a subprocess; yield (source, mount)."""
+    source_dir = tmpdir.mkdir("source")
+    mount_dir = tmpdir.mkdir("local")
+    debug_log = tmpdir / "debug.log"
+    args = (str(source_dir), str(mount_dir), str(debug_log))
+    fuse_process = Process(target=host_mount_local, args=args)
+    fuse_process.start()
+    ready_file = mount_dir / ".fuse_ready"
+    for _ in range(20):  # poll up to ~2s; the marker must be read as bytes
+        if ready_file.exists() and ready_file.read_binary() == b"ready":
+            break
+        time.sleep(0.1)
+    try:
+        yield (source_dir, mount_dir)
+    finally:
+        fuse_process.terminate()
+        fuse_process.join(timeout=10)
+        if fuse_process.is_alive():  # terminate() did not stop it within 10s
+            fuse_process.kill()
+            fuse_process.join()
+
+
+def test_mount(mount_local):
+    source_dir, mount_dir = mount_local
+    assert os.listdir(mount_dir) == []
+    assert os.listdir(source_dir) == []
+
+    mount_dir.mkdir("a")
+
+    assert os.listdir(mount_dir) == ["a"]
+    assert os.listdir(source_dir) == ["a"]
+
+
+def test_chmod(mount_local):
+    """`cp` inside the mount must run silently and the copy must reach the source."""
+    source_dir, mount_dir = mount_local
+    (mount_dir / "text").write("test")  # path API closes the handle before cp runs
+    assert os.listdir(source_dir) == ["text"]
+
+    cp = subprocess.run(
+        ["cp", str(mount_dir / "text"), str(mount_dir / "new")],
+        capture_output=True,
+        check=False,
+    )
+
+    assert cp.stderr == b""
+    assert cp.stdout == b""
+    assert set(os.listdir(source_dir)) == {"text", "new"}
+    assert (mount_dir / "new").read() == "test"
+
+
+def test_seek_rw(mount_local):
+    source_dir, mount_dir = mount_local
+    fh = open(mount_dir / "text", "w")
+    fh.write("teST")
+    fh.seek(2)
+    fh.write("st")
+    fh.close()
+
+    fh = open(mount_dir / "text", "r")
+    assert fh.read() == "test"
+    fh.seek(2)
+    assert fh.read() == "st"
+    fh.close()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_generic.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_generic.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc4c8bf01754c01e56394d8331090e8dc33150fe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_generic.py
@@ -0,0 +1,90 @@
+import pytest
+
+import fsspec
+from fsspec.tests.conftest import data, server  # noqa: F401
+
+
+def test_remote_async_ops(server):
+    fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
+    fs = fsspec.filesystem("generic", default_method="current")
+    out = fs.info(server + "/index/realfile")
+    assert out["size"] == len(data)
+    assert out["type"] == "file"
+    assert fs.isfile(server + "/index/realfile")  # this method from superclass
+
+
+def test_touch_rm(m):
+    m.touch("afile")
+    m.touch("dir/afile")
+
+    fs = fsspec.filesystem("generic", default_method="current")
+    fs.rm("memory://afile")
+    assert not m.exists("afile")
+
+    fs.rm("memory://dir", recursive=True)
+    assert not m.exists("dir/afile")
+    assert not m.exists("dir")
+
+
+def test_cp_async_to_sync(server, m):
+    fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
+    fs = fsspec.filesystem("generic", default_method="current")
+    fs.cp([server + "/index/realfile"], ["memory://realfile"])
+    assert m.cat("realfile") == data
+
+    fs.rm("memory://realfile")
+    assert not m.exists("realfile")
+
+
+def test_pipe_cat_sync(m):
+    fs = fsspec.filesystem("generic", default_method="current")
+    fs.pipe("memory://afile", b"data")
+    assert fs.cat("memory://afile") == b"data"
+
+
+def test_cat_async(server):
+    fsspec.filesystem("http", headers={"give_length": "true", "head_ok": "true"})
+    fs = fsspec.filesystem("generic", default_method="current")
+    assert fs.cat(server + "/index/realfile") == data
+
+
+def test_rsync(tmpdir, m):
+    from fsspec.generic import GenericFileSystem, rsync
+
+    fs = GenericFileSystem()
+    fs.pipe("memory:///deep/path/afile", b"data1")
+    fs.pipe("memory:///deep/afile", b"data2")
+
+    with pytest.raises(ValueError):
+        rsync("memory:///deep/afile", f"file://{tmpdir}")
+    rsync("memory://", f"file://{tmpdir}")
+
+    allfiles = fs.find(f"file://{tmpdir}", withdirs=True, detail=True)
+    pos_tmpdir = fsspec.implementations.local.make_path_posix(str(tmpdir))  # for WIN
+    assert set(allfiles) == {
+        f"file://{pos_tmpdir}{_}"
+        for _ in [
+            "",
+            "/deep",
+            "/deep/path",
+            "/deep/path/afile",
+            "/deep/afile",
+        ]
+    }
+    fs.rm("memory:///deep/afile")
+    rsync("memory://", f"file://{tmpdir}", delete_missing=True)
+    allfiles2 = fs.find(f"file://{tmpdir}", withdirs=True, detail=True)
+    assert set(allfiles2) == {
+        f"file://{pos_tmpdir}{_}"
+        for _ in [
+            "",
+            "/deep",
+            "/deep/path",
+            "/deep/path/afile",
+        ]
+    }
+    # the file was not updated, since size was correct
+    assert (
+        allfiles[f"file://{pos_tmpdir}/deep/path/afile"]
+        == allfiles2[f"file://{pos_tmpdir}/deep/path/afile"]
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_gui.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_gui.py
new file mode 100644
index 0000000000000000000000000000000000000000..38b83a2cfb49517e387091441d77f80e1ffa1aec
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_gui.py
@@ -0,0 +1,23 @@
+import pytest
+
+panel = pytest.importorskip("panel")
+
+
+def test_basic():
+    import fsspec.gui
+
+    gui = fsspec.gui.FileSelector()
+    assert "url" in str(gui.panel)
+
+
+def test_kwargs(tmpdir):
+    """confirm kwargs are passed to the filesystem instance"""
+    import fsspec.gui
+
+    gui = fsspec.gui.FileSelector(f"file://{tmpdir}", kwargs="{'auto_mkdir': True}")
+
+    assert gui.fs.auto_mkdir
+
+    gui = fsspec.gui.FileSelector(f"file://{tmpdir}", kwargs={"auto_mkdir": True})
+
+    assert gui.fs.auto_mkdir
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_mapping.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d6c8fdde5a775dd3078461c61ef76948f0deb5e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_mapping.py
@@ -0,0 +1,228 @@
+import os
+import pickle
+import platform
+import sys
+import uuid
+
+import pytest
+
+import fsspec
+from fsspec.implementations.local import LocalFileSystem
+from fsspec.implementations.memory import MemoryFileSystem
+
+
+def test_mapping_prefix(tmpdir):
+    tmpdir = str(tmpdir)
+    os.makedirs(os.path.join(tmpdir, "afolder"))
+    open(os.path.join(tmpdir, "afile"), "w").write("test")
+    open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")
+
+    m = fsspec.get_mapper(f"file://{tmpdir}")
+    assert "afile" in m
+    assert m["afolder/anotherfile"] == b"test2"
+
+    fs = fsspec.filesystem("file")
+    m2 = fs.get_mapper(tmpdir)
+    m3 = fs.get_mapper(f"file://{tmpdir}")
+
+    assert m == m2 == m3
+
+
+def test_getitems_errors(tmpdir):
+    tmpdir = str(tmpdir)
+    os.makedirs(os.path.join(tmpdir, "afolder"))
+    open(os.path.join(tmpdir, "afile"), "w").write("test")
+    open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")
+    m = fsspec.get_mapper(f"file://{tmpdir}")
+    assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"}
+    with pytest.raises(KeyError):
+        m.getitems(["afile", "bfile"])
+    out = m.getitems(["afile", "bfile"], on_error="return")
+    assert isinstance(out["bfile"], KeyError)
+    m = fsspec.get_mapper(f"file://{tmpdir}", missing_exceptions=())
+    assert m.getitems(["afile", "bfile"], on_error="omit") == {"afile": b"test"}
+    with pytest.raises(FileNotFoundError):
+        m.getitems(["afile", "bfile"])
+
+
+def test_ops():
+    MemoryFileSystem.store.clear()
+    m = fsspec.get_mapper("memory://")
+    assert not m
+    assert list(m) == []
+
+    with pytest.raises(KeyError):
+        m["hi"]
+
+    assert m.pop("key", 0) == 0
+
+    m["key0"] = b"data"
+    assert list(m) == ["key0"]
+    assert m["key0"] == b"data"
+
+    m.clear()
+
+    assert list(m) == []
+
+
+def test_pickle():
+    m = fsspec.get_mapper("memory://")
+    assert isinstance(m.fs, MemoryFileSystem)
+    m["key"] = b"data"
+    m2 = pickle.loads(pickle.dumps(m))
+    assert list(m) == list(m2)
+    assert m.missing_exceptions == m2.missing_exceptions
+
+
+def test_keys_view():
+    # https://github.com/fsspec/filesystem_spec/issues/186
+    m = fsspec.get_mapper("memory://")
+    m["key"] = b"data"
+
+    keys = m.keys()
+    assert len(keys) == 1
+    # check that we don't consume the keys
+    assert len(keys) == 1
+    m.clear()
+
+
+def test_multi():
+    m = fsspec.get_mapper("memory:///")
+    data = {"a": b"data1", "b": b"data2"}
+    m.setitems(data)
+
+    assert m.getitems(list(data)) == data
+    m.delitems(list(data))
+    assert not list(m)
+
+
+def test_setitem_types():
+    import array
+
+    m = fsspec.get_mapper("memory://")
+    m["a"] = array.array("i", [1])
+    if sys.byteorder == "little":
+        assert m["a"] == b"\x01\x00\x00\x00"
+    else:
+        assert m["a"] == b"\x00\x00\x00\x01"
+    m["b"] = bytearray(b"123")
+    assert m["b"] == b"123"
+    m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")})
+    if sys.byteorder == "little":
+        assert m["c"] == b"\x01\x00\x00\x00"
+    else:
+        assert m["c"] == b"\x00\x00\x00\x01"
+    assert m["d"] == b"123"
+
+
+def test_setitem_numpy():  # numpy values are stored and read back as raw bytes
+    m = fsspec.get_mapper("memory://")
+    np = pytest.importorskip("numpy")
+    m["c"] = np.array(1, dtype="<i4")  # scalar
+    assert m["c"] == b"\x01\x00\x00\x00"
+    m["c"] = np.array([1, 2], dtype="<i4")  # array
+    assert m["c"] == b"\x01\x00\x00\x00\x02\x00\x00\x00"
+    m["c"] = np.array(
+        np.datetime64("2000-01-01T23:59:59.999999999"), dtype="<M8[ns]"
+    )  # datetime64 scalar
+    assert m["c"] == b"\xff\xff\x91\xe3c\x9b#\r"
+    m["c"] = np.array(
+        [
+            np.datetime64("1900-01-01T23:59:59.999999999"),
+            np.datetime64("2000-01-01T23:59:59.999999999"),
+        ],
+        dtype="<M8[ns]",
+    )  # datetime64 array
+    assert m["c"] == b"\xff\xff}p\xf8fX\xe1\xff\xff\x91\xe3c\x9b#\r"
+    m["c"] = np.array(
+        np.timedelta64(3155673612345678901, "ns"), dtype="<m8[ns]"
+    )  # timedelta64 scalar
+    assert m["c"] == b"5\x1c\xf0Rn4\xcb+"
+    m["c"] = np.array(
+        [
+            np.timedelta64(450810516049382700, "ns"),
+            np.timedelta64(3155673612345678901, "ns"),
+        ],
+        dtype="<m8[ns]",
+    )  # timedelta64 array
+    assert m["c"] == b',M"\x9e\xc6\x99A\x065\x1c\xf0Rn4\xcb+'
+
+
+def test_empty_url():
+    m = fsspec.get_mapper()
+    assert isinstance(m.fs, LocalFileSystem)
+
+
+def test_fsmap_access_with_root_prefix(tmp_path):
+    # "/a" and "a" are the same for LocalFileSystem
+    tmp_path.joinpath("a").write_bytes(b"data")
+    m = fsspec.get_mapper(f"file://{tmp_path}")
+    assert m["/a"] == m["a"] == b"data"
+
+    # "/a" and "a" differ for MemoryFileSystem
+    m = fsspec.get_mapper(f"memory://{uuid.uuid4()}")
+    m["/a"] = b"data"
+
+    assert m["/a"] == b"data"
+    with pytest.raises(KeyError):
+        _ = m["a"]
+
+
+@pytest.mark.parametrize(
+    "key",
+    [
+        pytest.param(b"k", id="bytes"),
+        pytest.param(1234, id="int"),
+        pytest.param((1,), id="tuple"),
+        pytest.param([""], id="list"),
+    ],
+)
+def test_fsmap_non_str_keys(key):
+    m = fsspec.get_mapper()
+
+    # Once the deprecation period passes
+    # FSMap.__getitem__ should raise TypeError for non-str keys
+    #   with pytest.raises(TypeError):
+    #       _ = m[key]
+
+    with pytest.warns(DeprecationWarning):
+        with pytest.raises(KeyError):
+            _ = m[key]
+
+
+def test_fsmap_error_on_protocol_keys():
+    root = uuid.uuid4()
+    m = fsspec.get_mapper(f"memory://{root}", create=True)
+    m["a"] = b"data"
+
+    assert m["a"] == b"data"
+    with pytest.raises(KeyError):
+        _ = m[f"memory://{root}/a"]
+
+
+def test_fsmap_access_with_suffix(tmp_path):
+    tmp_path.joinpath("b").mkdir()
+    tmp_path.joinpath("b", "a").write_bytes(b"data")
+    if platform.system() == "Windows":
+        # on Windows opening a directory will raise PermissionError
+        # see: https://bugs.python.org/issue43095
+        missing_exceptions = (
+            FileNotFoundError,
+            IsADirectoryError,
+            NotADirectoryError,
+            PermissionError,
+        )
+    else:
+        missing_exceptions = None
+    m = fsspec.get_mapper(f"file://{tmp_path}", missing_exceptions=missing_exceptions)
+    with pytest.raises(KeyError):
+        _ = m["b/"]
+    assert m["b/a/"] == b"data"
+
+
+def test_fsmap_dirfs():
+    m = fsspec.get_mapper("memory://")
+
+    fs = m.dirfs
+    assert isinstance(fs, fsspec.implementations.dirfs.DirFileSystem)
+    assert fs.path == m.root
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_parquet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_parquet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f38db7c66509531409c10049be49e84485550f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_parquet.py
@@ -0,0 +1,140 @@
+import os
+
+import pytest
+
+try:
+    import fastparquet
+except ImportError:
+    fastparquet = None
+try:
+    import pyarrow.parquet as pq
+except ImportError:
+    pq = None
+
+from fsspec.core import url_to_fs
+from fsspec.parquet import _get_parquet_byte_ranges, open_parquet_file
+
+# Define `engine` fixture
+FASTPARQUET_MARK = pytest.mark.skipif(not fastparquet, reason="fastparquet not found")
+PYARROW_MARK = pytest.mark.skipif(not pq, reason="pyarrow not found")
+ANY_ENGINE_MARK = pytest.mark.skipif(
+    not (fastparquet or pq),
+    reason="No parquet engine (fastparquet or pyarrow) found",
+)
+
+
+@pytest.fixture(
+    params=[
+        pytest.param("fastparquet", marks=FASTPARQUET_MARK),
+        pytest.param("pyarrow", marks=PYARROW_MARK),
+        pytest.param("auto", marks=ANY_ENGINE_MARK),
+    ]
+)
+def engine(request):
+    return request.param
+
+
+@pytest.mark.parametrize("columns", [None, ["x"], ["x", "y"], ["z"]])
+@pytest.mark.parametrize("max_gap", [0, 64])
+@pytest.mark.parametrize("max_block", [64, 256_000_000])
+@pytest.mark.parametrize("footer_sample_size", [8, 1_000])
+@pytest.mark.parametrize("range_index", [True, False])
+def test_open_parquet_file(
+    tmpdir, engine, columns, max_gap, max_block, footer_sample_size, range_index
+):
+    # Pandas required for this test
+    pd = pytest.importorskip("pandas")
+
+    # Write out a simple DataFrame
+    path = os.path.join(str(tmpdir), "test.parquet")
+    nrows = 40
+    df = pd.DataFrame(
+        {
+            "x": [i * 7 % 5 for i in range(nrows)],
+            "y": [[0, i] for i in range(nrows)],  # list
+            "z": [{"a": i, "b": "cat"} for i in range(nrows)],  # struct
+        },
+        index=pd.Index([10 * i for i in range(nrows)], name="myindex"),
+    )
+    if range_index:
+        df = df.reset_index(drop=True)
+        df.index.name = "myindex"
+    df.to_parquet(path)
+
+    # "Traditional read" (without `open_parquet_file`)
+    expect = pd.read_parquet(path, columns=columns)
+
+    # Use `_get_parquet_byte_ranges` to re-write a
+    # place-holder file with all bytes NOT required
+    # to read `columns` set to b"0". The purpose of
+    # this step is to make sure the read will fail
+    # if the correct bytes have not been accurately
+    # selected by `_get_parquet_byte_ranges`. If this
+    # test were reading from remote storage, we would
+    # not need this logic to capture errors.
+    fs = url_to_fs(path)[0]
+    data = _get_parquet_byte_ranges(
+        [path],
+        fs,
+        columns=columns,
+        engine=engine,
+        max_gap=max_gap,
+        max_block=max_block,
+        footer_sample_size=footer_sample_size,
+    )[path]
+    file_size = fs.size(path)
+    with open(path, "wb") as f:
+        f.write(b"0" * file_size)
+
+        if footer_sample_size == 8:
+            # We know 8 bytes is too small to include
+            # the footer metadata, so there should NOT
+            # be a key for the last 8 bytes of the file
+            bad_key = (file_size - 8, file_size)
+            assert bad_key not in data.keys()
+
+        for (start, stop), byte_data in data.items():
+            f.seek(start)
+            f.write(byte_data)
+
+    # Read back the modified file with `open_parquet_file`
+    with open_parquet_file(
+        path,
+        columns=columns,
+        engine=engine,
+        max_gap=max_gap,
+        max_block=max_block,
+        footer_sample_size=footer_sample_size,
+    ) as f:
+        result = pd.read_parquet(f, columns=columns)
+
+    # Check that `result` matches `expect`
+    pd.testing.assert_frame_equal(expect, result)
+
+    # Try passing metadata
+    if engine == "fastparquet":
+        # Should work fine for "fastparquet"
+        pf = fastparquet.ParquetFile(path)
+        with open_parquet_file(
+            path,
+            metadata=pf,
+            columns=columns,
+            engine=engine,
+            max_gap=max_gap,
+            max_block=max_block,
+            footer_sample_size=footer_sample_size,
+        ) as f:
+            result = pd.read_parquet(f, columns=columns)
+        pd.testing.assert_frame_equal(expect, result)
+    elif engine == "pyarrow":
+        # Should raise ValueError for "pyarrow"
+        with pytest.raises(ValueError):
+            open_parquet_file(
+                path,
+                metadata=["Not-None"],
+                columns=columns,
+                engine=engine,
+                max_gap=max_gap,
+                max_block=max_block,
+                footer_sample_size=footer_sample_size,
+            )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_registry.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..0664912a16a864072d20d9d7677703de98d2221a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_registry.py
@@ -0,0 +1,134 @@
+import sys
+from importlib.metadata import EntryPoint
+from unittest.mock import create_autospec, patch
+
+import pytest
+
+import fsspec
+from fsspec.implementations.zip import ZipFileSystem
+from fsspec.registry import (
+    _registry,
+    filesystem,
+    get_filesystem_class,
+    known_implementations,
+    register_implementation,
+    registry,
+)
+from fsspec.spec import AbstractFileSystem
+
+
+@pytest.fixture()
+def clear_registry():
+    try:
+        yield
+    finally:
+        _registry.clear()
+        known_implementations.pop("test", None)
+
+
+@pytest.fixture()
+def clean_imports():
+    """Hide the imported ``fsspec`` module from ``sys.modules`` for one test."""
+    real_module = sys.modules.pop("fsspec")  # outside try: nothing to restore yet
+    try:
+        yield
+    finally:
+        sys.modules["fsspec"] = real_module  # put the original module object back
+
+
+def test_registry_readonly():
+    get_filesystem_class("file")
+    assert "file" in registry
+    assert "file" in list(registry)
+    with pytest.raises(TypeError):
+        del registry["file"]
+    with pytest.raises(TypeError):
+        registry["file"] = None
+    with pytest.raises(AttributeError):
+        registry.clear()
+
+
+def test_register_cls(clear_registry):
+    with pytest.raises(ValueError):
+        get_filesystem_class("test")
+    register_implementation("test", AbstractFileSystem)
+    cls = get_filesystem_class("test")
+    assert cls is AbstractFileSystem
+
+
+def test_register_str(clear_registry):
+    with pytest.raises(ValueError):
+        get_filesystem_class("test")
+    register_implementation("test", "fsspec.AbstractFileSystem")
+    assert "test" not in registry
+    cls = get_filesystem_class("test")
+    assert cls is AbstractFileSystem
+    assert "test" in registry
+
+
+def test_register_fail(clear_registry):
+    register_implementation("test", "doesntexist.AbstractFileSystem")
+    with pytest.raises(ImportError):
+        get_filesystem_class("test")
+
+    # NOOP
+    register_implementation("test", "doesntexist.AbstractFileSystem", clobber=False)
+    with pytest.raises(ValueError):
+        register_implementation(
+            "test", "doesntexist.AbstractFileSystemm", clobber=False
+        )
+
+    # by default we do not allow clobbering
+    with pytest.raises(ValueError):
+        register_implementation("test", "doesntexist.AbstractFileSystemm")
+
+    register_implementation(
+        "test", "doesntexist.AbstractFileSystem", errtxt="hiho", clobber=True
+    )
+    with pytest.raises(ImportError) as e:
+        get_filesystem_class("test")
+    assert "hiho" in str(e.value)
+    register_implementation("test", AbstractFileSystem)
+
+    # NOOP
+    register_implementation("test", AbstractFileSystem)
+    with pytest.raises(ValueError):
+        register_implementation("test", ZipFileSystem)
+    register_implementation("test", AbstractFileSystem, clobber=True)
+    assert isinstance(fsspec.filesystem("test"), AbstractFileSystem)
+
+
+def test_entry_points_registered_on_import(clear_registry, clean_imports):
+    mock_ep = create_autospec(EntryPoint, module="fsspec.spec.AbstractFileSystem")
+    mock_ep.name = "test"  # this can't be set in the constructor...
+    mock_ep.value = "fsspec.spec.AbstractFileSystem"
+    import_location = "importlib.metadata.entry_points"
+    with patch(import_location, return_value={"fsspec.specs": [mock_ep]}):
+        assert "test" not in registry
+        import fsspec  # noqa
+
+        get_filesystem_class("test")
+        assert "test" in registry
+
+
+def test_filesystem_warning_arrow_hdfs_deprecated(clear_registry, clean_imports):
+    mock_ep = create_autospec(EntryPoint, module="fsspec.spec.AbstractFileSystem")
+    mock_ep.name = "arrow_hdfs"  # this can't be set in the constructor...
+    mock_ep.value = "fsspec.spec.AbstractFileSystem"
+    import_location = "importlib.metadata.entry_points"
+    with patch(import_location, return_value={"fsspec.specs": [mock_ep]}):
+        import fsspec  # noqa
+
+        with pytest.warns(DeprecationWarning):
+            filesystem("arrow_hdfs")
+
+
+def test_old_s3(monkeypatch):
+    from fsspec.registry import _import_class
+
+    s3fs = pytest.importorskip("s3fs")
+    monkeypatch.setattr(s3fs, "__version__", "0.4.2")
+    with pytest.warns():
+        _import_class("s3fs:S3FileSystem")
+    with pytest.warns():
+        _import_class("s3fs.S3FileSystem")
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_spec.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..de23d783df2b1c43501a8d61d048870793bb27ed
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_spec.py
@@ -0,0 +1,1167 @@
+import glob
+import json
+import os
+import pickle
+import subprocess
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+import fsspec
+from fsspec.implementations.ftp import FTPFileSystem
+from fsspec.implementations.http import HTTPFileSystem
+from fsspec.implementations.local import LocalFileSystem
+from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
+
+PATHS_FOR_GLOB_TESTS = (
+    {"name": "test0.json", "type": "file", "size": 100},
+    {"name": "test0.yaml", "type": "file", "size": 100},
+    {"name": "test0", "type": "directory", "size": 0},
+    {"name": "test0/test0.json", "type": "file", "size": 100},
+    {"name": "test0/test0.yaml", "type": "file", "size": 100},
+    {"name": "test0/test1", "type": "directory", "size": 0},
+    {"name": "test0/test1/test0.json", "type": "file", "size": 100},
+    {"name": "test0/test1/test0.yaml", "type": "file", "size": 100},
+    {"name": "test0/test1/test2", "type": "directory", "size": 0},
+    {"name": "test0/test1/test2/test0.json", "type": "file", "size": 100},
+    {"name": "test0/test1/test2/test0.yaml", "type": "file", "size": 100},
+    {"name": "test0/test2", "type": "directory", "size": 0},
+    {"name": "test0/test2/test0.json", "type": "file", "size": 100},
+    {"name": "test0/test2/test0.yaml", "type": "file", "size": 100},
+    {"name": "test0/test2/test1", "type": "directory", "size": 0},
+    {"name": "test0/test2/test1/test0.json", "type": "file", "size": 100},
+    {"name": "test0/test2/test1/test0.yaml", "type": "file", "size": 100},
+    {"name": "test0/test2/test1/test3", "type": "directory", "size": 0},
+    {"name": "test0/test2/test1/test3/test0.json", "type": "file", "size": 100},
+    {"name": "test0/test2/test1/test3/test0.yaml", "type": "file", "size": 100},
+    {"name": "test1.json", "type": "file", "size": 100},
+    {"name": "test1.yaml", "type": "file", "size": 100},
+    {"name": "test1", "type": "directory", "size": 0},
+    {"name": "test1/test0.json", "type": "file", "size": 100},
+    {"name": "test1/test0.yaml", "type": "file", "size": 100},
+    {"name": "test1/test0", "type": "directory", "size": 0},
+    {"name": "test1/test0/test0.json", "type": "file", "size": 100},
+    {"name": "test1/test0/test0.yaml", "type": "file", "size": 100},
+    {"name": "special_chars", "type": "directory", "size": 0},
+    {"name": "special_chars/f\\oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f.oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f+oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f(oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f)oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f|oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f^oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f$oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f{oo.txt", "type": "file", "size": 100},
+    {"name": "special_chars/f}oo.txt", "type": "file", "size": 100},
+)
+
+GLOB_POSIX_TESTS = {
+    "argnames": ("path", "expected"),
+    "argvalues": [
+        ("nonexistent", []),
+        ("test0.json", ["test0.json"]),
+        ("test0", ["test0"]),
+        ("test0/", ["test0"]),
+        ("test1/test0.yaml", ["test1/test0.yaml"]),
+        ("test0/test[1-2]", ["test0/test1", "test0/test2"]),
+        ("test0/test[1-2]/", ["test0/test1", "test0/test2"]),
+        (
+            "test0/test[1-2]/*",
+            [
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test2/test0.json",
+                "test0/test2/test0.yaml",
+                "test0/test2/test1",
+            ],
+        ),
+        (
+            "test0/test[1-2]/*.[j]*",
+            ["test0/test1/test0.json", "test0/test2/test0.json"],
+        ),
+        ("special_chars/f\\oo.*", ["special_chars/f\\oo.txt"]),
+        ("special_chars/f.oo.*", ["special_chars/f.oo.txt"]),
+        ("special_chars/f+oo.*", ["special_chars/f+oo.txt"]),
+        ("special_chars/f(oo.*", ["special_chars/f(oo.txt"]),
+        ("special_chars/f)oo.*", ["special_chars/f)oo.txt"]),
+        ("special_chars/f|oo.*", ["special_chars/f|oo.txt"]),
+        ("special_chars/f^oo.*", ["special_chars/f^oo.txt"]),
+        ("special_chars/f$oo.*", ["special_chars/f$oo.txt"]),
+        ("special_chars/f{oo.*", ["special_chars/f{oo.txt"]),
+        ("special_chars/f}oo.*", ["special_chars/f}oo.txt"]),
+        (
+            "*",
+            [
+                "special_chars",
+                "test0.json",
+                "test0.yaml",
+                "test0",
+                "test1.json",
+                "test1.yaml",
+                "test1",
+            ],
+        ),
+        ("*.yaml", ["test0.yaml", "test1.yaml"]),
+        (
+            "**",
+            [
+                "special_chars",
+                "special_chars/f$oo.txt",
+                "special_chars/f(oo.txt",
+                "special_chars/f)oo.txt",
+                "special_chars/f+oo.txt",
+                "special_chars/f.oo.txt",
+                "special_chars/f\\oo.txt",
+                "special_chars/f^oo.txt",
+                "special_chars/f{oo.txt",
+                "special_chars/f|oo.txt",
+                "special_chars/f}oo.txt",
+                "test0.json",
+                "test0.yaml",
+                "test0",
+                "test0/test0.json",
+                "test0/test0.yaml",
+                "test0/test1",
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test1/test2/test0.json",
+                "test0/test1/test2/test0.yaml",
+                "test0/test2",
+                "test0/test2/test0.json",
+                "test0/test2/test0.yaml",
+                "test0/test2/test1",
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+                "test0/test2/test1/test3/test0.json",
+                "test0/test2/test1/test3/test0.yaml",
+                "test1.json",
+                "test1.yaml",
+                "test1",
+                "test1/test0.json",
+                "test1/test0.yaml",
+                "test1/test0",
+                "test1/test0/test0.json",
+                "test1/test0/test0.yaml",
+            ],
+        ),
+        ("*/", ["special_chars", "test0", "test1"]),
+        (
+            "**/",
+            [
+                "special_chars",
+                "test0",
+                "test0/test1",
+                "test0/test1/test2",
+                "test0/test2",
+                "test0/test2/test1",
+                "test0/test2/test1/test3",
+                "test1",
+                "test1/test0",
+            ],
+        ),
+        ("*/*.yaml", ["test0/test0.yaml", "test1/test0.yaml"]),
+        (
+            "**/*.yaml",
+            [
+                "test0.yaml",
+                "test0/test0.yaml",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2/test0.yaml",
+                "test0/test2/test0.yaml",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3/test0.yaml",
+                "test1.yaml",
+                "test1/test0.yaml",
+                "test1/test0/test0.yaml",
+            ],
+        ),
+        (
+            "*/test1/*",
+            ["test0/test1/test0.json", "test0/test1/test0.yaml", "test0/test1/test2"],
+        ),
+        ("*/test1/*.yaml", ["test0/test1/test0.yaml"]),
+        (
+            "**/test1/*",
+            [
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+                "test1/test0.json",
+                "test1/test0.yaml",
+                "test1/test0",
+            ],
+        ),
+        (
+            "**/test1/*.yaml",
+            [
+                "test0/test1/test0.yaml",
+                "test0/test2/test1/test0.yaml",
+                "test1/test0.yaml",
+            ],
+        ),
+        ("*/test1/*/", ["test0/test1/test2"]),
+        (
+            "**/test1/*/",
+            ["test0/test1/test2", "test0/test2/test1/test3", "test1/test0"],
+        ),
+        (
+            "*/test1/**",
+            [
+                "test0/test1",
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test1/test2/test0.json",
+                "test0/test1/test2/test0.yaml",
+            ],
+        ),
+        (
+            "**/test1/**",
+            [
+                "test0/test1",
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test1/test2/test0.json",
+                "test0/test1/test2/test0.yaml",
+                "test0/test2/test1",
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+                "test0/test2/test1/test3/test0.json",
+                "test0/test2/test1/test3/test0.yaml",
+                "test1",
+                "test1/test0.json",
+                "test1/test0.yaml",
+                "test1/test0",
+                "test1/test0/test0.json",
+                "test1/test0/test0.yaml",
+            ],
+        ),
+        ("*/test1/**/", ["test0/test1", "test0/test1/test2"]),
+        (
+            "**/test1/**/",
+            [
+                "test0/test1",
+                "test0/test1/test2",
+                "test0/test2/test1",
+                "test0/test2/test1/test3",
+                "test1",
+                "test1/test0",
+            ],
+        ),
+        (
+            "test0/*",
+            ["test0/test0.json", "test0/test0.yaml", "test0/test1", "test0/test2"],
+        ),
+        ("test0/*.yaml", ["test0/test0.yaml"]),
+        (
+            "test0/**",
+            [
+                "test0",
+                "test0/test0.json",
+                "test0/test0.yaml",
+                "test0/test1",
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test1/test2/test0.json",
+                "test0/test1/test2/test0.yaml",
+                "test0/test2",
+                "test0/test2/test0.json",
+                "test0/test2/test0.yaml",
+                "test0/test2/test1",
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+                "test0/test2/test1/test3/test0.json",
+                "test0/test2/test1/test3/test0.yaml",
+            ],
+        ),
+        ("test0/*/", ["test0/test1", "test0/test2"]),
+        (
+            "test0/**/",
+            [
+                "test0",
+                "test0/test1",
+                "test0/test1/test2",
+                "test0/test2",
+                "test0/test2/test1",
+                "test0/test2/test1/test3",
+            ],
+        ),
+        ("test0/*/*.yaml", ["test0/test1/test0.yaml", "test0/test2/test0.yaml"]),
+        (
+            "test0/**/*.yaml",
+            [
+                "test0/test0.yaml",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2/test0.yaml",
+                "test0/test2/test0.yaml",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3/test0.yaml",
+            ],
+        ),
+        (
+            "test0/*/test1/*",
+            [
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+            ],
+        ),
+        ("test0/*/test1/*.yaml", ["test0/test2/test1/test0.yaml"]),
+        (
+            "test0/**/test1/*",
+            [
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+            ],
+        ),
+        (
+            "test0/**/test1/*.yaml",
+            ["test0/test1/test0.yaml", "test0/test2/test1/test0.yaml"],
+        ),
+        ("test0/*/test1/*/", ["test0/test2/test1/test3"]),
+        ("test0/**/test1/*/", ["test0/test1/test2", "test0/test2/test1/test3"]),
+        (
+            "test0/*/test1/**",
+            [
+                "test0/test2/test1",
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+                "test0/test2/test1/test3/test0.json",
+                "test0/test2/test1/test3/test0.yaml",
+            ],
+        ),
+        (
+            "test0/**/test1/**",
+            [
+                "test0/test1",
+                "test0/test1/test0.json",
+                "test0/test1/test0.yaml",
+                "test0/test1/test2",
+                "test0/test1/test2/test0.json",
+                "test0/test1/test2/test0.yaml",
+                "test0/test2/test1",
+                "test0/test2/test1/test0.json",
+                "test0/test2/test1/test0.yaml",
+                "test0/test2/test1/test3",
+                "test0/test2/test1/test3/test0.json",
+                "test0/test2/test1/test3/test0.yaml",
+            ],
+        ),
+        ("test0/*/test1/**/", ["test0/test2/test1", "test0/test2/test1/test3"]),
+        (
+            "test0/**/test1/**/",
+            [
+                "test0/test1",
+                "test0/test1/test2",
+                "test0/test2/test1",
+                "test0/test2/test1/test3",
+            ],
+        ),
+    ],
+}
+
+
+class DummyTestFS(AbstractFileSystem):
+    protocol = "mock"
+    _file_class = AbstractBufferedFile
+    _fs_contents = (
+        {"name": "top_level", "type": "directory"},
+        {"name": "top_level/second_level", "type": "directory"},
+        {"name": "top_level/second_level/date=2019-10-01", "type": "directory"},
+        {
+            "name": "top_level/second_level/date=2019-10-01/a.parquet",
+            "type": "file",
+            "size": 100,
+        },
+        {
+            "name": "top_level/second_level/date=2019-10-01/b.parquet",
+            "type": "file",
+            "size": 100,
+        },
+        {"name": "top_level/second_level/date=2019-10-02", "type": "directory"},
+        {
+            "name": "top_level/second_level/date=2019-10-02/a.parquet",
+            "type": "file",
+            "size": 100,
+        },
+        {"name": "top_level/second_level/date=2019-10-04", "type": "directory"},
+        {
+            "name": "top_level/second_level/date=2019-10-04/a.parquet",
+            "type": "file",
+            "size": 100,
+        },
+        {"name": "misc", "type": "directory"},
+        {"name": "misc/foo.txt", "type": "file", "size": 100},
+    )
+
+    def __init__(self, fs_content=None, **kwargs):
+        if fs_content is not None:
+            self._fs_contents = fs_content
+        super().__init__(**kwargs)
+
+    def __getitem__(self, name):
+        for item in self._fs_contents:
+            if item["name"] == name:
+                return item
+        raise IndexError(f"{name} not found!")
+
+    def ls(self, path, detail=True, refresh=True, **kwargs):
+        if kwargs.pop("strip_proto", True):
+            path = self._strip_protocol(path)
+
+        files = not refresh and self._ls_from_cache(path)
+        if not files:
+            files = [
+                file for file in self._fs_contents if path == self._parent(file["name"])
+            ]
+            files.sort(key=lambda file: file["name"])
+            self.dircache[path.rstrip("/")] = files
+
+        if detail:
+            return files
+        return [file["name"] for file in files]
+
+    @classmethod
+    def get_test_paths(cls, start_with=""):
+        """Helper to return directory and file paths with no details"""
+        all = [
+            file["name"]
+            for file in cls._fs_contents
+            if file["name"].startswith(start_with)
+        ]
+        return all
+
+    def _open(
+        self,
+        path,
+        mode="rb",
+        block_size=None,
+        autocommit=True,
+        cache_options=None,
+        **kwargs,
+    ):
+        return self._file_class(
+            self,
+            path,
+            mode,
+            block_size,
+            autocommit,
+            cache_options=cache_options,
+            **kwargs,
+        )
+
+
+@pytest.mark.parametrize(
+    ["test_paths", "recursive", "maxdepth", "expected"],
+    [
+        (
+            (
+                "top_level/second_level",
+                "top_level/sec*",
+                "top_level/sec*vel",
+                "top_level/*",
+            ),
+            True,
+            None,
+            [
+                "top_level/second_level",
+                "top_level/second_level/date=2019-10-01",
+                "top_level/second_level/date=2019-10-01/a.parquet",
+                "top_level/second_level/date=2019-10-01/b.parquet",
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+                "top_level/second_level/date=2019-10-04",
+                "top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+        (
+            (
+                "top_level/second_level",
+                "top_level/sec*",
+                "top_level/sec*vel",
+                "top_level/*",
+            ),
+            False,
+            None,
+            [
+                "top_level/second_level",
+            ],
+        ),
+        (
+            ("top_level/second_level",),
+            True,
+            1,
+            [
+                "top_level/second_level",
+                "top_level/second_level/date=2019-10-01",
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-04",
+            ],
+        ),
+        (
+            ("top_level/second_level",),
+            True,
+            2,
+            [
+                "top_level/second_level",
+                "top_level/second_level/date=2019-10-01",
+                "top_level/second_level/date=2019-10-01/a.parquet",
+                "top_level/second_level/date=2019-10-01/b.parquet",
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+                "top_level/second_level/date=2019-10-04",
+                "top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+        (
+            ("top_level/*", "top_level/sec*", "top_level/sec*vel", "top_level/*"),
+            True,
+            1,
+            ["top_level/second_level"],
+        ),
+        (
+            ("top_level/*", "top_level/sec*", "top_level/sec*vel", "top_level/*"),
+            True,
+            2,
+            [
+                "top_level/second_level",
+                "top_level/second_level/date=2019-10-01",
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-04",
+            ],
+        ),
+        (
+            ("top_level/**",),
+            False,
+            None,
+            [
+                "top_level",
+                "top_level/second_level",
+                "top_level/second_level/date=2019-10-01",
+                "top_level/second_level/date=2019-10-01/a.parquet",
+                "top_level/second_level/date=2019-10-01/b.parquet",
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+                "top_level/second_level/date=2019-10-04",
+                "top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+        (
+            ("top_level/**",),
+            True,
+            None,
+            [
+                "top_level",
+                "top_level/second_level",
+                "top_level/second_level/date=2019-10-01",
+                "top_level/second_level/date=2019-10-01/a.parquet",
+                "top_level/second_level/date=2019-10-01/b.parquet",
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+                "top_level/second_level/date=2019-10-04",
+                "top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+        (("top_level/**",), True, 1, ["top_level", "top_level/second_level"]),
+        (
+            ("top_level/**",),
+            True,
+            2,
+            [
+                "top_level",
+                "top_level/second_level",
+                "top_level/second_level/date=2019-10-01",
+                "top_level/second_level/date=2019-10-01/a.parquet",
+                "top_level/second_level/date=2019-10-01/b.parquet",
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+                "top_level/second_level/date=2019-10-04",
+                "top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+        (
+            ("top_level/**/a.*",),
+            False,
+            None,
+            [
+                "top_level/second_level/date=2019-10-01/a.parquet",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+                "top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+        (
+            ("top_level/**/a.*",),
+            True,
+            None,
+            [
+                "top_level/second_level/date=2019-10-01/a.parquet",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+                "top_level/second_level/date=2019-10-04/a.parquet",
+            ],
+        ),
+        (
+            ("top_level/**/second_level/date=2019-10-02",),
+            False,
+            2,
+            [
+                "top_level/second_level/date=2019-10-02",
+            ],
+        ),
+        (
+            ("top_level/**/second_level/date=2019-10-02",),
+            True,
+            2,
+            [
+                "top_level/second_level/date=2019-10-02",
+                "top_level/second_level/date=2019-10-02/a.parquet",
+            ],
+        ),
+        [("misc/foo.txt", "misc/*.txt"), False, None, ["misc/foo.txt"]],
+        [("misc/foo.txt", "misc/*.txt"), True, None, ["misc/foo.txt"]],
+        (
+            ("",),
+            False,
+            None,
+            [DummyTestFS.root_marker],
+        ),
+        (
+            ("",),
+            True,
+            None,
+            DummyTestFS.get_test_paths() + [DummyTestFS.root_marker],
+        ),
+        [
+            (Path("misc/foo.txt"),),
+            False,
+            None,
+            [f"misc{os.sep}foo.txt"],
+        ],
+    ],
+)
+def test_expand_path(test_paths, recursive, maxdepth, expected):
+    """Test a number of paths and then their combination which should all yield
+    the same set of expanded paths"""
+    test_fs = DummyTestFS()
+
+    # test single query
+    for test_path in test_paths:
+        paths = test_fs.expand_path(test_path, recursive=recursive, maxdepth=maxdepth)
+        assert sorted(paths) == sorted(expected)
+
+    # test with all queries
+    paths = test_fs.expand_path(
+        list(test_paths), recursive=recursive, maxdepth=maxdepth
+    )
+    assert sorted(paths) == sorted(expected)
+
+
+def test_expand_paths_with_wrong_args():
+    test_fs = DummyTestFS()
+
+    with pytest.raises(ValueError):
+        test_fs.expand_path("top_level", recursive=True, maxdepth=0)
+    with pytest.raises(ValueError):
+        test_fs.expand_path("top_level", maxdepth=0)
+    with pytest.raises(FileNotFoundError):
+        test_fs.expand_path("top_level/**/second_level/date=2019-10-02", maxdepth=1)
+    with pytest.raises(FileNotFoundError):
+        test_fs.expand_path("nonexistent/*")
+
+
+@pytest.mark.xfail
+def test_find():
+    """Test .find() method on debian server (ftp, https) with constant folder"""
+    filesystem, host, test_path = (
+        FTPFileSystem,
+        "ftp.fau.de",
+        "ftp://ftp.fau.de/debian-cd/current/amd64/log/success",
+    )
+    test_fs = filesystem(host)
+    filenames_ftp = test_fs.find(test_path)
+    assert filenames_ftp
+
+    filesystem, host, test_path = (
+        HTTPFileSystem,
+        "https://ftp.fau.de",
+        "https://ftp.fau.de/debian-cd/current/amd64/log/success",
+    )
+    test_fs = filesystem()
+    filenames_http = test_fs.find(test_path)
+    roots = [f.rsplit("/", 1)[-1] for f in filenames_http]
+
+    assert all(f.rsplit("/", 1)[-1] in roots for f in filenames_ftp)
+
+
+def test_find_details():
+    test_fs = DummyTestFS()
+    filenames = test_fs.find("/")
+    details = test_fs.find("/", detail=True)
+    for filename in filenames:
+        assert details[filename] == test_fs.info(filename)
+
+
+def test_find_file():
+    test_fs = DummyTestFS()
+
+    filename = "misc/foo.txt"
+    assert test_fs.find(filename) == [filename]
+    assert test_fs.find(filename, detail=True) == {filename: {}}
+
+
+def test_cache():
+    fs = DummyTestFS()
+    fs2 = DummyTestFS()
+    assert fs is fs2
+
+    assert DummyTestFS.current() is fs
+    assert len(fs._cache) == 1
+    del fs2
+    assert len(fs._cache) == 1
+    del fs
+
+    # keeps an internal reference, doesn't get collected
+    assert len(DummyTestFS._cache) == 1
+
+    DummyTestFS.clear_instance_cache()
+    assert len(DummyTestFS._cache) == 0
+
+
+def test_current():
+    fs = DummyTestFS()
+    fs2 = DummyTestFS(arg=1)
+
+    assert fs is not fs2
+    assert DummyTestFS.current() is fs2
+
+    DummyTestFS()
+    assert DummyTestFS.current() is fs
+
+
+def test_alias():
+    with pytest.warns(FutureWarning, match="add_aliases"):
+        DummyTestFS(add_aliases=True)
+
+
+def test_add_docs_warns():
+    with pytest.warns(FutureWarning, match="add_docs"):
+        AbstractFileSystem(add_docs=True)
+
+
+def test_cache_options():
+    fs = DummyTestFS()
+    f = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes")
+    assert f.cache.trim
+
+    # TODO: dummy buffered file
+    f = AbstractBufferedFile(
+        fs, "misc/foo.txt", cache_type="bytes", cache_options={"trim": False}
+    )
+    assert f.cache.trim is False
+
+    f = fs.open("misc/foo.txt", cache_type="bytes", cache_options={"trim": False})
+    assert f.cache.trim is False
+
+
+def test_trim_kwarg_warns():
+    fs = DummyTestFS()
+    with pytest.warns(FutureWarning, match="cache_options"):
+        AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes", trim=False)
+
+
+def tests_file_open_error(monkeypatch):
+    class InitiateError(ValueError): ...
+
+    class UploadError(ValueError): ...
+
+    class DummyBufferedFile(AbstractBufferedFile):
+        can_initiate = False
+
+        def _initiate_upload(self):
+            if not self.can_initiate:
+                raise InitiateError
+
+        def _upload_chunk(self, final=False):
+            raise UploadError
+
+    monkeypatch.setattr(DummyTestFS, "_file_class", DummyBufferedFile)
+
+    fs = DummyTestFS()
+    with pytest.raises(InitiateError):
+        with fs.open("misc/foo.txt", "wb") as stream:
+            stream.write(b"hello" * stream.blocksize * 2)
+
+    with pytest.raises(UploadError):
+        with fs.open("misc/foo.txt", "wb") as stream:
+            stream.can_initiate = True
+            stream.write(b"hello" * stream.blocksize * 2)
+
+
+def test_eq():
+    fs = DummyTestFS()
+    result = fs == 1
+    assert result is False
+
+    f = AbstractBufferedFile(fs, "misc/foo.txt", cache_type="bytes")
+    result = f == 1
+    assert result is False
+
+
+def test_pickle_multiple():
+    a = DummyTestFS(1)
+    b = DummyTestFS(2, bar=1)
+
+    x = pickle.dumps(a)
+    y = pickle.dumps(b)
+
+    del a, b
+    DummyTestFS.clear_instance_cache()
+
+    result = pickle.loads(x)
+    assert result.storage_args == (1,)
+    assert result.storage_options == {}
+
+    result = pickle.loads(y)
+    assert result.storage_args == (2,)
+    assert result.storage_options == {"bar": 1}
+
+
+def test_json():
+    a = DummyTestFS(1)
+    b = DummyTestFS(2, bar=1)
+
+    outa = a.to_json()
+    outb = b.to_json()
+
+    assert json.loads(outb)  # is valid JSON
+    assert a != b
+    assert "bar" in outb
+
+    assert DummyTestFS.from_json(outa) is a
+    assert DummyTestFS.from_json(outb) is b
+
+
+def test_ls_from_cache():
+    fs = DummyTestFS()
+    uncached_results = fs.ls("top_level/second_level/", refresh=True)
+
+    assert fs.ls("top_level/second_level/", refresh=False) == uncached_results
+
+    # _strip_protocol removes everything by default, though;
+    # for the sake of testing the _ls_from_cache interface
+    # directly, we need to run one more time without that call
+    # to actually verify that our stripping in the client
+    # function works.
+    assert (
+        fs.ls("top_level/second_level/", refresh=False, strip_proto=False)
+        == uncached_results
+    )
+
+
+@pytest.mark.parametrize(
+    "dt",
+    [
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+        np.float32,
+        np.float64,
+    ],
+)
+def test_readinto_with_numpy(tmpdir, dt):
+    store_path = str(tmpdir / "test_arr.npy")
+    arr = np.arange(10, dtype=dt)
+    arr.tofile(store_path)
+
+    arr2 = np.empty_like(arr)
+    with fsspec.open(store_path, "rb") as f:
+        f.readinto(arr2)
+
+    assert np.array_equal(arr, arr2)
+
+
+@pytest.mark.parametrize(
+    "dt",
+    [
+        np.int8,
+        np.int16,
+        np.int32,
+        np.int64,
+        np.uint8,
+        np.uint16,
+        np.uint32,
+        np.uint64,
+        np.float32,
+        np.float64,
+    ],
+)
+def test_readinto_with_multibyte(ftp_writable, tmpdir, dt):
+    host, port, user, pw = ftp_writable
+    ftp = FTPFileSystem(host=host, port=port, username=user, password=pw)
+
+    with ftp.open("/out", "wb") as fp:
+        arr = np.arange(10, dtype=dt)
+        fp.write(arr.tobytes())
+
+    with ftp.open("/out", "rb") as fp:
+        arr2 = np.empty_like(arr)
+        fp.readinto(arr2)
+
+    assert np.array_equal(arr, arr2)
+
+
+class DummyOpenFS(DummyTestFS):
+    blocksize = 10
+
+    def _open(self, path, mode="rb", **kwargs):
+        stream = open(path, mode)
+        stream.size = os.stat(path).st_size
+        return stream
+
+
+class BasicCallback(fsspec.Callback):
+    def __init__(self, **kwargs):
+        self.events = []
+        super().__init__(**kwargs)
+
+    def set_size(self, size):
+        self.events.append(("set_size", size))
+
+    def relative_update(self, inc=1):
+        self.events.append(("relative_update", inc))
+
+
+def imitate_transfer(size, chunk, *, file=True):
+    events = [("set_size", size)]
+    events.extend(("relative_update", size // chunk) for _ in range(chunk))
+    if file:
+        # The reason there is a relative_update(0) at the end
+        # is that we don't have an early exit in the
+        # implementations of get_file/put_file, so it needs to
+        # go through the callback to get caught by the while's
+        # condition, and then it will stop the transfer.
+        events.append(("relative_update", 0))
+
+    return events
+
+
+def get_files(tmpdir, amount=10):
+    src, dest, base = [], [], []
+    for index in range(amount):
+        src_path = tmpdir / f"src_{index}.txt"
+        src_path.write_text("x" * 50, "utf-8")
+
+        src.append(str(src_path))
+        dest.append(str(tmpdir / f"dst_{index}.txt"))
+        base.append(str(tmpdir / f"file_{index}.txt"))
+    return src, dest, base
+
+
+def test_dummy_callbacks_file(tmpdir):
+    fs = DummyOpenFS()
+    callback = BasicCallback()
+
+    file = tmpdir / "file.txt"
+    source = tmpdir / "tmp.txt"
+    destination = tmpdir / "tmp2.txt"
+
+    size = 100
+    source.write_text("x" * 100, "utf-8")
+
+    fs.put_file(source, file, callback=callback)
+
+    # -1 here since put_file no longer has final zero-size put
+    assert callback.events == imitate_transfer(size, 10)[:-1]
+    callback.events.clear()
+
+    fs.get_file(file, destination, callback=callback)
+    assert callback.events == imitate_transfer(size, 10)
+    callback.events.clear()
+
+    assert destination.read_text("utf-8") == "x" * 100
+
+
+def test_dummy_callbacks_files(tmpdir):
+    fs = DummyOpenFS()
+    callback = BasicCallback()
+    src, dest, base = get_files(tmpdir)
+
+    fs.put(src, base, callback=callback)
+    assert callback.events == imitate_transfer(10, 10, file=False)
+    callback.events.clear()
+
+    fs.get(base, dest, callback=callback)
+    assert callback.events == imitate_transfer(10, 10, file=False)
+
+
+class BranchableCallback(BasicCallback):
+    def __init__(self, source, dest=None, events=None, **kwargs):
+        super().__init__(**kwargs)
+        if dest:
+            self.key = source, dest
+        else:
+            self.key = (source,)
+        self.events = events or defaultdict(list)
+
+    def branch(self, path_1, path_2, kwargs):
+        from fsspec.implementations.local import make_path_posix
+
+        path_1 = make_path_posix(path_1)
+        path_2 = make_path_posix(path_2)
+        kwargs["callback"] = BranchableCallback(path_1, path_2, events=self.events)
+
+    def set_size(self, size):
+        self.events[self.key].append(("set_size", size))
+
+    def relative_update(self, inc=1):
+        self.events[self.key].append(("relative_update", inc))
+
+
+def test_dummy_callbacks_files_branched(tmpdir):
+    fs = DummyOpenFS()
+    src, dest, base = get_files(tmpdir)
+
+    callback = BranchableCallback("top-level")
+
+    def check_events(lpaths, rpaths):
+        from fsspec.implementations.local import make_path_posix
+
+        base_keys = zip(make_path_posix(lpaths), make_path_posix(rpaths))
+        assert set(callback.events.keys()) == {("top-level",), *base_keys}
+        assert callback.events["top-level",] == imitate_transfer(10, 10, file=False)
+
+        for key in base_keys:
+            assert callback.events[key] == imitate_transfer(50, 5)
+
+    fs.put(src, base, callback=callback)
+    check_events(src, base)
+    callback.events.clear()
+
+    fs.get(base, dest, callback=callback)
+    check_events(base, dest)
+    callback.events.clear()
+
+
+def _clean_paths(paths, prefix=""):
+    """
+    Helper to cleanup paths results by doing the following:
+      - remove the prefix provided from all paths
+      - remove the trailing slashes from all paths
+      - remove duplicates paths
+      - sort all paths
+    """
+    paths_list = paths
+    if isinstance(paths, dict):
+        paths_list = list(paths)
+    paths_list = [p.replace(prefix, "").strip("/") for p in sorted(set(paths_list))]
+    if isinstance(paths, dict):
+        return {p: paths[p] for p in paths_list}
+    return paths_list
+
+
+@pytest.fixture(scope="function")
+def glob_fs():
+    return DummyTestFS(fs_content=PATHS_FOR_GLOB_TESTS)
+
+
+@pytest.fixture(scope="function")
+def glob_files_folder(tmp_path):
+    local_fs = LocalFileSystem(auto_mkdir=True)
+    local_fake_dir = str(tmp_path)
+    for path_info in PATHS_FOR_GLOB_TESTS:
+        if path_info["type"] == "file":
+            local_fs.touch(path=f"{str(tmp_path)}/{path_info['name']}")
+    return local_fake_dir
+
+
+@pytest.mark.skipif(
+    sys.platform.startswith("win"),
+    reason="no need to run python glob posix tests on windows",
+)
+@pytest.mark.parametrize(
+    GLOB_POSIX_TESTS["argnames"],
+    GLOB_POSIX_TESTS["argvalues"],
+)
+def test_posix_tests_python_glob(path, expected, glob_files_folder):
+    """
+    Tests against python glob to check if our posix tests are accurate.
+    """
+    os.chdir(glob_files_folder)
+
+    python_output = glob.glob(pathname=path, recursive=True)
+    assert _clean_paths(python_output, glob_files_folder) == _clean_paths(expected)
+
+
+@pytest.mark.skipif(
+    sys.platform.startswith("win"),
+    reason="no need to run bash stat posix tests on windows",
+)
+@pytest.mark.parametrize(
+    GLOB_POSIX_TESTS["argnames"],
+    GLOB_POSIX_TESTS["argvalues"],
+)
+def test_posix_tests_bash_stat(path, expected, glob_files_folder):
+    """
+    Tests against bash stat to check if our posix tests are accurate.
+    """
+    try:
+        subprocess.check_output(["bash", "-c", "shopt -s globstar"])
+    except FileNotFoundError:
+        pytest.skip("bash is not available")
+    except subprocess.CalledProcessError:
+        pytest.skip("globstar option is not available")
+
+    bash_path = (
+        path.replace("\\", "\\\\")
+        .replace("$", "\\$")
+        .replace("(", "\\(")
+        .replace(")", "\\)")
+        .replace("|", "\\|")
+    )
+    bash_output = subprocess.run(
+        [
+            "bash",
+            "-c",
+            f"cd {glob_files_folder} && shopt -s globstar && stat -c %N {bash_path}",
+        ],
+        capture_output=True,
+        check=False,
+    )
+    # Remove the last element always empty
+    bash_output = bash_output.stdout.decode("utf-8").replace("'", "").split("\n")[:-1]
+    assert _clean_paths(bash_output, glob_files_folder) == _clean_paths(expected)
+
+
+@pytest.mark.parametrize(
+    GLOB_POSIX_TESTS["argnames"],
+    GLOB_POSIX_TESTS["argvalues"],
+)
+def test_glob_posix_rules(path, expected, glob_fs):
+    output = glob_fs.glob(path=f"mock://{path}")
+    assert _clean_paths(output) == _clean_paths(expected)
+
+    detailed_output = glob_fs.glob(path=f"mock://{path}", detail=True)
+    for name, info in _clean_paths(detailed_output).items():
+        assert info == glob_fs[name]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..10fa89a2d2ddbc3e393a9c880973167b27d96746
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/fsspec/tests/test_utils.py
@@ -0,0 +1,478 @@
+import io
+import sys
+from pathlib import Path, PurePath
+from unittest.mock import Mock
+
+import pytest
+
+import fsspec.utils
+from fsspec.utils import (
+    can_be_local,
+    common_prefix,
+    get_protocol,
+    infer_storage_options,
+    merge_offset_ranges,
+    mirror_from,
+    other_paths,
+    read_block,
+    seek_delimiter,
+    setup_logging,
+)
+
+WIN = sys.platform.startswith("win")
+
+
+def test_read_block():
+    delimiter = b"\n"
+    data = delimiter.join([b"123", b"456", b"789"])
+    f = io.BytesIO(data)
+
+    assert read_block(f, 1, 2) == b"23"
+    assert read_block(f, 0, 1, delimiter=b"\n") == b"123\n"
+    assert read_block(f, 0, 2, delimiter=b"\n") == b"123\n"
+    assert read_block(f, 0, 3, delimiter=b"\n") == b"123\n"
+    assert read_block(f, 0, 5, delimiter=b"\n") == b"123\n456\n"
+    assert read_block(f, 0, 8, delimiter=b"\n") == b"123\n456\n789"
+    assert read_block(f, 0, 100, delimiter=b"\n") == b"123\n456\n789"
+    assert read_block(f, 1, 1, delimiter=b"\n") == b""
+    assert read_block(f, 1, 5, delimiter=b"\n") == b"456\n"
+    assert read_block(f, 1, 8, delimiter=b"\n") == b"456\n789"
+
+    for ols in [[(0, 3), (3, 3), (6, 3), (9, 2)], [(0, 4), (4, 4), (8, 4)]]:
+        out = [read_block(f, o, l, b"\n") for o, l in ols]
+        assert b"".join(filter(None, out)) == data
+
+
+def test_read_block_split_before():
+    """Test start/middle/end cases of split_before."""  # noqa: I
+    d = (
+        "#header" + "".join(">foo{i}\nFOOBAR{i}\n".format(i=i) for i in range(100000))
+    ).encode()
+
+    # Read single record at beginning.
+    # All reads include beginning of file and read through termination of
+    # delimited record.
+    assert read_block(io.BytesIO(d), 0, 10, delimiter=b"\n") == b"#header>foo0\n"
+    assert (
+        read_block(io.BytesIO(d), 0, 10, delimiter=b"\n", split_before=True)
+        == b"#header>foo0"
+    )
+    assert (
+        read_block(io.BytesIO(d), 0, 10, delimiter=b">") == b"#header>foo0\nFOOBAR0\n>"
+    )
+    assert (
+        read_block(io.BytesIO(d), 0, 10, delimiter=b">", split_before=True)
+        == b"#header>foo0\nFOOBAR0\n"
+    )
+
+    # Read multiple records at beginning.
+    # All reads include beginning of file and read through termination of
+    # delimited record.
+    assert (
+        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n")
+        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
+    )
+    assert (
+        read_block(io.BytesIO(d), 0, 27, delimiter=b"\n", split_before=True)
+        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1"
+    )
+    assert (
+        read_block(io.BytesIO(d), 0, 27, delimiter=b">")
+        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n>"
+    )
+    assert (
+        read_block(io.BytesIO(d), 0, 27, delimiter=b">", split_before=True)
+        == b"#header>foo0\nFOOBAR0\n>foo1\nFOOBAR1\n"
+    )
+
+    # Read with offset spanning into next record, splits on either side of delimiter.
+    # Read not spanning the full record returns nothing.
+    assert read_block(io.BytesIO(d), 10, 3, delimiter=b"\n") == b"FOOBAR0\n"
+    assert (
+        read_block(io.BytesIO(d), 10, 3, delimiter=b"\n", split_before=True)
+        == b"\nFOOBAR0"
+    )
+    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">") == b""
+    assert read_block(io.BytesIO(d), 10, 3, delimiter=b">", split_before=True) == b""
+
+    # Read with offset spanning multiple records, splits on either side of delimiter
+    assert (
+        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n")
+        == b"FOOBAR0\n>foo1\nFOOBAR1\n"
+    )
+    assert (
+        read_block(io.BytesIO(d), 10, 20, delimiter=b"\n", split_before=True)
+        == b"\nFOOBAR0\n>foo1\nFOOBAR1"
+    )
+    assert read_block(io.BytesIO(d), 10, 20, delimiter=b">") == b"foo1\nFOOBAR1\n>"
+    assert (
+        read_block(io.BytesIO(d), 10, 20, delimiter=b">", split_before=True)
+        == b">foo1\nFOOBAR1\n"
+    )
+
+    # Read record at end, all records read to end
+
+    tlen = len(d)
+
+    assert (
+        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n")
+        == b">foo99999\nFOOBAR99999\n"
+    )
+
+    assert (
+        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b"\n", split_before=True)
+        == b"\n>foo99999\nFOOBAR99999\n"
+    )
+
+    assert (
+        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">")
+        == b"foo99999\nFOOBAR99999\n"
+    )
+
+    assert (
+        read_block(io.BytesIO(d), tlen - 30, 35, delimiter=b">", split_before=True)
+        == b">foo99999\nFOOBAR99999\n"
+    )
+
+
+def test_seek_delimiter_endline():
+    f = io.BytesIO(b"123\n456\n789")
+
+    # if at zero, stay at zero
+    seek_delimiter(f, b"\n", 5)
+    assert f.tell() == 0
+
+    # choose the first block
+    for bs in [1, 5, 100]:
+        f.seek(1)
+        seek_delimiter(f, b"\n", blocksize=bs)
+        assert f.tell() == 4
+
+    # handle long delimiters well, even with short blocksizes
+    f = io.BytesIO(b"123abc456abc789")
+    for bs in [1, 2, 3, 4, 5, 6, 10]:
+        f.seek(1)
+        seek_delimiter(f, b"abc", blocksize=bs)
+        assert f.tell() == 6
+
+    # End at the end
+    f = io.BytesIO(b"123\n456")
+    f.seek(5)
+    seek_delimiter(f, b"\n", 5)
+    assert f.tell() == 7
+
+
+def test_infer_options():
+    so = infer_storage_options("/mnt/datasets/test.csv")
+    assert so.pop("protocol") == "file"
+    assert so.pop("path") == "/mnt/datasets/test.csv"
+    assert not so
+
+    assert infer_storage_options("./test.csv")["path"] == "./test.csv"
+    assert infer_storage_options("../test.csv")["path"] == "../test.csv"
+
+    so = infer_storage_options("C:\\test.csv")
+    assert so.pop("protocol") == "file"
+    assert so.pop("path") == "C:\\test.csv"
+    assert not so
+
+    assert infer_storage_options("d:\\test.csv")["path"] == "d:\\test.csv"
+    assert infer_storage_options("\\test.csv")["path"] == "\\test.csv"
+    assert infer_storage_options(".\\test.csv")["path"] == ".\\test.csv"
+    assert infer_storage_options("test.csv")["path"] == "test.csv"
+
+    so = infer_storage_options(
+        "hdfs://username:pwd@Node:123/mnt/datasets/test.csv?q=1#fragm",
+        inherit_storage_options={"extra": "value"},
+    )
+    assert so.pop("protocol") == "hdfs"
+    assert so.pop("username") == "username"
+    assert so.pop("password") == "pwd"
+    assert so.pop("host") == "Node"
+    assert so.pop("port") == 123
+    assert so.pop("path") == "/mnt/datasets/test.csv#fragm"
+    assert so.pop("url_query") == "q=1"
+    assert so.pop("url_fragment") == "fragm"
+    assert so.pop("extra") == "value"
+    assert not so
+
+    so = infer_storage_options("hdfs://User-name@Node-name.com/mnt/datasets/test.csv")
+    assert so.pop("username") == "User-name"
+    assert so.pop("host") == "Node-name.com"
+
+    u = "http://127.0.0.1:8080/test.csv"
+    assert infer_storage_options(u) == {"protocol": "http", "path": u}
+
+    # For s3 and gcs the netloc is actually the bucket name, so we want to
+    # include it in the path. Test that:
+    # - Parsing doesn't lowercase the bucket
+    # - The bucket is included in path
+    for protocol in ["s3", "s3a", "gcs", "gs"]:
+        options = infer_storage_options(f"{protocol}://Bucket-name.com/test.csv")
+        assert options["path"] == "Bucket-name.com/test.csv"
+
+    with pytest.raises(KeyError):
+        infer_storage_options("file:///bucket/file.csv", {"path": "collide"})
+    with pytest.raises(KeyError):
+        infer_storage_options("hdfs:///bucket/file.csv", {"protocol": "collide"})
+
+
+def test_infer_simple():
+    out = infer_storage_options("//mnt/datasets/test.csv")
+    assert out["protocol"] == "file"
+    assert out["path"] == "//mnt/datasets/test.csv"
+    assert out.get("host", None) is None
+
+
+@pytest.mark.parametrize(
+    "urlpath, expected_path",
+    (
+        (r"c:\foo\bar", r"c:\foo\bar"),
+        (r"C:\\foo\bar", r"C:\\foo\bar"),
+        (r"c:/foo/bar", r"c:/foo/bar"),
+        (r"file:///c|\foo\bar", r"c:\foo\bar"),
+        (r"file:///C|/foo/bar", r"C:/foo/bar"),
+        (r"file:///C:/foo/bar", r"C:/foo/bar"),
+    ),
+)
+def test_infer_storage_options_c(urlpath, expected_path):
+    so = infer_storage_options(urlpath)
+    assert so["protocol"] == "file"
+    assert so["path"] == expected_path
+
+
+@pytest.mark.parametrize(
+    "paths, out",
+    (
+        (["/more/dir/", "/more/dir/two", "/more/one", "/more/three"], "/more"),
+        (["/", "", "/"], ""),
+        (["/", "/"], "/"),
+        (["/more/", "/"], ""),
+        (["/more/", "/more"], "/more"),
+        (["more/dir/", "more/dir/two", "more/one", "more/three"], "more"),
+    ),
+)
+def test_common_prefix(paths, out):
+    assert common_prefix(paths) == out
+
+
+@pytest.mark.parametrize(
+    "paths, other, exists, expected",
+    (
+        (["/path1"], "/path2", False, ["/path2"]),
+        (["/path1"], "/path2", True, ["/path2/path1"]),
+        (["/path1"], "/path2", False, ["/path2"]),
+        (["/path1"], "/path2/", True, ["/path2/path1"]),
+        (["/path1"], ["/path2"], False, ["/path2"]),
+        (["/path1"], ["/path2"], True, ["/path2"]),
+        (["/path1", "/path2"], "/path2", False, ["/path2/path1", "/path2/path2"]),
+        (["/path1", "/path2"], "/path2", True, ["/path2/path1", "/path2/path2"]),
+        (
+            ["/more/path1", "/more/path2"],
+            "/path2",
+            False,
+            ["/path2/path1", "/path2/path2"],
+        ),
+        (
+            ["/more/path1", "/more/path2"],
+            "/path2",
+            True,
+            ["/path2/more/path1", "/path2/more/path2"],
+        ),
+        (
+            ["/more/path1", "/more/path2"],
+            "/path2",
+            False,
+            ["/path2/path1", "/path2/path2"],
+        ),
+        (
+            ["/more/path1", "/more/path2"],
+            "/path2",
+            True,
+            ["/path2/more/path1", "/path2/more/path2"],
+        ),
+        (
+            ["/more/path1", "/more/path2"],
+            "/path2/",
+            False,
+            ["/path2/path1", "/path2/path2"],
+        ),
+        (
+            ["/more/path1", "/more/path2"],
+            "/path2/",
+            True,
+            ["/path2/more/path1", "/path2/more/path2"],
+        ),
+        (
+            ["/more/path1", "/diff/path2"],
+            "/path2/",
+            False,
+            ["/path2/more/path1", "/path2/diff/path2"],
+        ),
+        (
+            ["/more/path1", "/diff/path2"],
+            "/path2/",
+            True,
+            ["/path2/more/path1", "/path2/diff/path2"],
+        ),
+        (["a", "b/", "b/c"], "dest/", False, ["dest/a", "dest/b/", "dest/b/c"]),
+        (
+            ["/a", "/b/", "/b/c"],
+            "dest/",
+            False,
+            ["dest/a", "dest/b/", "dest/b/c"],
+        ),
+    ),
+)
+def test_other_paths(paths, other, exists, expected):
+    assert other_paths(paths, other, exists) == expected
+
+
+def test_log():
+    import logging
+
+    logger = setup_logging(logger_name="fsspec.test")
+    assert logger.level == logging.DEBUG
+
+
+@pytest.mark.parametrize(
+    "par",
+    [
+        ("afile", "file"),
+        ("file://afile", "file"),
+        ("noproto://afile", "noproto"),
+        ("noproto::stuff", "noproto"),
+        ("simplecache::stuff", "simplecache"),
+        ("simplecache://stuff", "simplecache"),
+        ("s3://afile", "s3"),
+        (Path("afile"), "file"),
+    ],
+)
+def test_get_protocol(par):
+    url, outcome = par
+    assert get_protocol(url) == outcome
+
+
+@pytest.mark.parametrize(
+    "par",
+    [
+        ("afile", True),
+        ("file://afile", True),
+        ("noproto://afile", False),
+        ("noproto::stuff", False),
+        ("simplecache::stuff", True),
+        ("simplecache://stuff", True),
+        (Path("afile"), True),
+    ],
+)
+def test_can_local(par):
+    url, outcome = par
+    assert can_be_local(url) == outcome
+
+
+def test_mirror_from():
+    mock = Mock()
+    mock.attr = 1
+
+    @mirror_from("client", ["attr", "func_1", "func_2"])
+    class Real:
+        @property
+        def client(self):
+            return mock
+
+        def func_2(self):
+            assert False, "have to overwrite this"
+
+        def func_3(self):
+            return "should succeed"
+
+    obj = Real()
+    assert obj.attr == mock.attr
+
+    obj.func_1()
+    mock.func_1.assert_called()
+
+    obj.func_2(1, 2)
+    mock.func_2.assert_called_with(1, 2)
+
+    assert obj.func_3() == "should succeed"
+    mock.func_3.assert_not_called()
+
+
+@pytest.mark.parametrize("max_gap", [0, 32])
+@pytest.mark.parametrize("max_block", [None, 128])
+def test_merge_offset_ranges(max_gap, max_block):
+    # Input ranges
+    # (Using out-of-order ranges for full coverage)
+    paths = ["foo", "bar", "bar", "bar", "foo"]
+    starts = [0, 0, 512, 64, 32]
+    ends = [32, 32, 1024, 256, 64]
+
+    # Call merge_offset_ranges
+    (
+        result_paths,
+        result_starts,
+        result_ends,
+    ) = merge_offset_ranges(
+        paths,
+        starts,
+        ends,
+        max_gap=max_gap,
+        max_block=max_block,
+    )
+
+    # Check result
+    if max_block is None and max_gap == 32:
+        expect_paths = ["bar", "bar", "foo"]
+        expect_starts = [0, 512, 0]
+        expect_ends = [256, 1024, 64]
+    else:
+        expect_paths = ["bar", "bar", "bar", "foo"]
+        expect_starts = [0, 64, 512, 0]
+        expect_ends = [32, 256, 1024, 64]
+
+    assert expect_paths == result_paths
+    assert expect_starts == result_starts
+    assert expect_ends == result_ends
+
+
+def test_size():
+    f = io.BytesIO(b"hello")
+    assert fsspec.utils.file_size(f) == 5
+    assert f.tell() == 0
+
+
+class _HasFspath:
+    def __fspath__(self):
+        return "foo"
+
+
+class _HasPathAttr:
+    def __init__(self):
+        self.path = "foo"
+
+
+@pytest.mark.parametrize(
+    "path,expected",
+    [
+        # coerce to string
+        ("foo", "foo"),
+        (Path("foo"), "foo"),
+        (PurePath("foo"), "foo"),
+        (_HasFspath(), "foo"),
+        (_HasPathAttr(), "foo"),
+        # passthrough
+        (b"bytes", b"bytes"),
+        (None, None),
+        (1, 1),
+        (True, True),
+        (o := object(), o),
+        ([], []),
+        ((), ()),
+        (set(), set()),
+    ],
+)
+def test_stringify_path(path, expected):
+    path = fsspec.utils.stringify_path(path)
+
+    assert path == expected
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/huggingface_hub-0.36.2.dist-info/licenses/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/ElementSoup.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/ElementSoup.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64c547458e418e51e1e349ad1a90787f8d749fbe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/ElementSoup.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7fa8596c874ef45b38a7e70664b1aa3af0ed9ef0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_diffcommand.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_diffcommand.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed34c182e00d05e773ce340be649b07cf1e198d5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_diffcommand.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_difflib.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_difflib.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52e4305a2362523a01e21c2e48ae9b0622298cac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_difflib.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_html5builder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_html5builder.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0622c73d42ecd0ec13da67fa22437eea6e3091a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_html5builder.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_setmixin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_setmixin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41dc466109d6c974fb77e7af000d4cd1d60fe066
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/_setmixin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/builder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/builder.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b8de17cf8c3020747b2f9e2e092b9ef5fce5c96
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/builder.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/clean.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/clean.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..470e8112d8c81d8fa15336ff6e56fde0616924c2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/clean.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/defs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/defs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9551006a10c4e82a4d4ec3f06e81648c40d380b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/defs.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/diff.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/diff.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db44479c142c01c453c3991c68d1dff66166c368
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/diff.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/formfill.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/formfill.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a32d00b06f25b0bfd456fa482b5d0e459f97d3be
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/formfill.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/html5parser.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/html5parser.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5163f56add1e7fe8d3f667d9968d767bfcd6565
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/html5parser.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/soupparser.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/soupparser.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee47a83f78807fa33b573d23024a8f3df7df3a68
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/soupparser.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/usedoctest.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/usedoctest.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7984c0833594a220c42887f03f39ce3873a41926
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/__pycache__/usedoctest.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49599ddd7de19acbd26134b4c47f988a138a7dec
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/c14n.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/c14n.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..8b1f3c4c516b23ec938da286e0ddb7b8f5795ee2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/c14n.pxd
@@ -0,0 +1,25 @@
+from lxml.includes.tree cimport xmlDoc, xmlOutputBuffer, xmlChar
+from lxml.includes.xpath cimport xmlNodeSet
+
+cdef extern from "libxml/c14n.h" nogil:
+    cdef int xmlC14NDocDumpMemory(xmlDoc* doc,
+                                  xmlNodeSet* nodes,
+                                  int exclusive,
+                                  xmlChar** inclusive_ns_prefixes,
+                                  int with_comments,
+                                  xmlChar** doc_txt_ptr)
+
+    cdef int xmlC14NDocSave(xmlDoc* doc,
+                            xmlNodeSet* nodes,
+                            int exclusive,
+                            xmlChar** inclusive_ns_prefixes,
+                            int with_comments,
+                            char* filename,
+                            int compression)
+
+    cdef int xmlC14NDocSaveTo(xmlDoc* doc,
+                              xmlNodeSet* nodes,
+                              int exclusive,
+                              xmlChar** inclusive_ns_prefixes,
+                              int with_comments,
+                              xmlOutputBuffer* buffer)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/dtdvalid.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/dtdvalid.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..2ad49db11b20e4e710f4f55cf5590ef3928f1058
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/dtdvalid.pxd
@@ -0,0 +1,18 @@
+from lxml.includes cimport tree
+from lxml.includes.tree cimport xmlDoc, xmlDtd
+
+cdef extern from "libxml/valid.h" nogil:
+    ctypedef void (*xmlValidityErrorFunc)(void * ctx, const char * msg, ...) noexcept
+    ctypedef void (*xmlValidityWarningFunc)(void * ctx, const char * msg, ...) noexcept
+
+    ctypedef struct xmlValidCtxt:
+        void *userData
+        xmlValidityErrorFunc error
+        xmlValidityWarningFunc warning
+
+    cdef xmlValidCtxt* xmlNewValidCtxt()
+    cdef void xmlFreeValidCtxt(xmlValidCtxt* cur)
+
+    cdef int xmlValidateDtd(xmlValidCtxt* ctxt, xmlDoc* doc, xmlDtd* dtd)
+    cdef tree.xmlElement* xmlGetDtdElementDesc(
+        xmlDtd* dtd, tree.const_xmlChar* name)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etree_defs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etree_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cb6eb3a58c5910770733e8837c5d296049fe4cb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etree_defs.h
@@ -0,0 +1,390 @@
+#ifndef HAS_ETREE_DEFS_H
+#define HAS_ETREE_DEFS_H
+
+/* quick check for Python/libxml2/libxslt devel setup */
+#include "Python.h"
+#ifndef PY_VERSION_HEX
+#  error the development package of Python (header files etc.) is not installed correctly
+#elif PY_VERSION_HEX < 0x03060000
+#  error this version of lxml requires Python 3.6 or later
+#endif
+
+#include "libxml/xmlversion.h"
+#ifndef LIBXML_VERSION
+#  error the development package of libxml2 (header files etc.) is not installed correctly
+#elif LIBXML_VERSION < 20700
+#  error minimum required version of libxml2 is 2.7.0
+#endif
+
+#include "libxslt/xsltconfig.h"
+#ifndef LIBXSLT_VERSION
+#  error the development package of libxslt (header files etc.) is not installed correctly
+#elif LIBXSLT_VERSION < 10123
+#  error minimum required version of libxslt is 1.1.23
+#endif
+
+
+/* v_arg functions */
+#define va_int(ap)     va_arg(ap, int)
+#define va_charptr(ap) va_arg(ap, char *)
+
+#ifdef PYPY_VERSION
+#    define IS_PYPY 1
+#else
+#    define IS_PYPY 0
+#endif
+
+/* unused */
+#define IS_PYTHON2 0
+#define IS_PYTHON3 1
+#undef LXML_UNICODE_STRINGS
+#define LXML_UNICODE_STRINGS 1
+
+#if !IS_PYPY
+#  define PyWeakref_LockObject(obj)          (NULL)
+#endif
+
+/* Threading is not currently supported by PyPy */
+#if IS_PYPY
+#  ifndef WITHOUT_THREADING
+#    define WITHOUT_THREADING
+#  endif
+#endif
+
+#if IS_PYPY
+#  ifndef PyUnicode_FromFormat
+#    define PyUnicode_FromFormat  PyString_FromFormat
+#  endif
+#  if !defined(PyBytes_FromFormat)
+#    ifdef PyString_FromFormat
+#      define PyBytes_FromFormat  PyString_FromFormat
+#    else
+#include <stdarg.h>
+static PyObject* PyBytes_FromFormat(const char* format, ...) {
+    PyObject *string;
+    va_list vargs;
+#ifdef HAVE_STDARG_PROTOTYPES
+    va_start(vargs, format);
+#else
+    va_start(vargs);
+#endif
+    string = PyUnicode_FromFormatV(format, vargs);
+    va_end(vargs);
+    if (string && PyUnicode_Check(string)) {
+        PyObject *bstring = PyUnicode_AsUTF8String(string);
+        Py_DECREF(string);
+        string = bstring;
+    }
+    if (string && !PyBytes_CheckExact(string)) {
+        Py_DECREF(string);
+        string = NULL;
+        PyErr_SetString(PyExc_TypeError, "String formatting and encoding failed to return bytes object");
+    }
+    return string;
+}
+#    endif
+#  endif
+#endif
+
+#if PY_VERSION_HEX >= 0x030B00A1
+/* Python 3.12 doesn't have wstr Unicode strings any more. */
+#undef PyUnicode_GET_DATA_SIZE
+#define PyUnicode_GET_DATA_SIZE(ustr)  (0)
+#undef PyUnicode_AS_DATA
+#define PyUnicode_AS_DATA(ustr)  (NULL)
+#undef PyUnicode_IS_READY
+#define PyUnicode_IS_READY(ustr)  (1)
+#endif
+
+#ifdef WITHOUT_THREADING
+#  undef PyEval_SaveThread
+#  define PyEval_SaveThread() (NULL)
+#  undef PyEval_RestoreThread
+#  define PyEval_RestoreThread(state)  if (state); else {}
+#  undef PyGILState_Ensure
+#  define PyGILState_Ensure() (PyGILState_UNLOCKED)
+#  undef PyGILState_Release
+#  define PyGILState_Release(state)  if (state); else {}
+#  undef  Py_UNBLOCK_THREADS
+#  define Py_UNBLOCK_THREADS  _save = NULL;
+#  undef  Py_BLOCK_THREADS
+#  define Py_BLOCK_THREADS  if (_save); else {}
+#endif
+
+#ifdef WITHOUT_THREADING
+#  define ENABLE_THREADING 0
+#else
+#  define ENABLE_THREADING 1
+#endif
+
+#if LIBXML_VERSION < 20704
+/* FIXME: hack to make new error reporting compile in old libxml2 versions */
+#  define xmlStructuredErrorContext NULL
+#  define xmlXIncludeProcessTreeFlagsData(n,o,d) xmlXIncludeProcessTreeFlags(n,o)
+#endif
+
+/* schematron was added in libxml2 2.6.21 */
+#ifdef LIBXML_SCHEMATRON_ENABLED
+#  define ENABLE_SCHEMATRON 1
+#else
+#  define ENABLE_SCHEMATRON 0
+#  define XML_SCHEMATRON_OUT_QUIET 0
+#  define XML_SCHEMATRON_OUT_XML 0
+#  define XML_SCHEMATRON_OUT_ERROR 0
+   typedef void xmlSchematron;
+   typedef void xmlSchematronParserCtxt;
+   typedef void xmlSchematronValidCtxt;
+#  define xmlSchematronNewDocParserCtxt(doc) NULL
+#  define xmlSchematronNewParserCtxt(file) NULL
+#  define xmlSchematronParse(ctxt) NULL
+#  define xmlSchematronFreeParserCtxt(ctxt)
+#  define xmlSchematronFree(schema)
+#  define xmlSchematronNewValidCtxt(schema, options) NULL
+#  define xmlSchematronValidateDoc(ctxt, doc) 0
+#  define xmlSchematronFreeValidCtxt(ctxt)
+#  define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data)
+#endif
+
+#if LIBXML_VERSION < 20708
+#  define HTML_PARSE_NODEFDTD 4
+#endif
+#if LIBXML_VERSION < 20900
+#  define XML_PARSE_BIG_LINES 0x400000
+#endif
+#if LIBXML_VERSION < 21300
+#  define XML_PARSE_NO_XXE 0x800000
+#endif
+#if LIBXML_VERSION < 21400
+#  define XML_PARSE_UNZIP 0x1000000
+#  define XML_PARSE_NO_SYS_CATALOG 0x2000000
+#  define XML_PARSE_CATALOG_PI 0x4000000
+#endif
+#if LIBXML_VERSION < 21500
+#  define XML_PARSE_SKIP_IDS 0x8000000
+#endif
+
+#include "libxml/tree.h"
+#ifndef LIBXML2_NEW_BUFFER
+   typedef xmlBuffer xmlBuf;
+#  define xmlBufContent(buf) xmlBufferContent(buf)
+#  define xmlBufUse(buf) xmlBufferLength(buf)
+#endif
+
+#if LIBXML_VERSION < 21500
+#  define xmlCtxtIsStopped(p_ctxt)  ((p_ctxt)->disableSAX != 0)
+#endif
+
+/* libexslt 1.1.25+ support EXSLT functions in XPath */
+#if LIBXSLT_VERSION < 10125
+#define exsltDateXpathCtxtRegister(ctxt, prefix)
+#define exsltSetsXpathCtxtRegister(ctxt, prefix)
+#define exsltMathXpathCtxtRegister(ctxt, prefix)
+#define exsltStrXpathCtxtRegister(ctxt, prefix)
+#endif
+
+#define LXML_GET_XSLT_ENCODING(result_var, style) XSLT_GET_IMPORT_PTR(result_var, style, encoding)
+
+/* work around MSDEV 6.0 */
+#if (_MSC_VER == 1200) && (WINVER < 0x0500)
+long _ftol( double ); //defined by VC6 C libs
+long _ftol2( double dblSource ) { return _ftol( dblSource ); }
+#endif
+
+#ifdef __GNUC__
+/* Test for GCC > 2.95 */
+#if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))
+#define unlikely_condition(x) __builtin_expect((x), 0)
+#else /* __GNUC__ > 2 ... */
+#define unlikely_condition(x) (x)
+#endif /* __GNUC__ > 2 ... */
+#else /* __GNUC__ */
+#define unlikely_condition(x) (x)
+#endif /* __GNUC__ */
+
+#ifndef Py_TYPE
+  #define Py_TYPE(ob)   (((PyObject*)(ob))->ob_type)
+#endif
+
+#define _fqtypename(o)  ((Py_TYPE(o))->tp_name)
+
+#define lxml_malloc(count, item_size) \
+    (unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \
+     (PyMem_Malloc((count) * item_size)))
+
+#define lxml_realloc(mem, count, item_size) \
+    (unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \
+     (PyMem_Realloc(mem, (count) * item_size)))
+
+#define lxml_free(mem)  PyMem_Free(mem)
+
+#define _isString(obj)   (PyUnicode_Check(obj) || PyBytes_Check(obj))
+
+#define _isElement(c_node) \
+        (((c_node)->type == XML_ELEMENT_NODE) || \
+         ((c_node)->type == XML_COMMENT_NODE) || \
+         ((c_node)->type == XML_ENTITY_REF_NODE) || \
+         ((c_node)->type == XML_PI_NODE))
+
+#define _isElementOrXInclude(c_node) \
+        (_isElement(c_node)                     || \
+         ((c_node)->type == XML_XINCLUDE_START) || \
+         ((c_node)->type == XML_XINCLUDE_END))
+
+#define _getNs(c_node) \
+        (((c_node)->ns == 0) ? 0 : ((c_node)->ns->href))
+
+
+#include "string.h"
+static void* lxml_unpack_xmldoc_capsule(PyObject* capsule, int* is_owned) {
+    xmlDoc *c_doc;
+    void *context;
+    *is_owned = 0;
+    if (unlikely_condition(!PyCapsule_IsValid(capsule, (const char*)"libxml2:xmlDoc"))) {
+        PyErr_SetString(
+                PyExc_TypeError,
+                "Not a valid capsule. The capsule argument must be a capsule object with name libxml2:xmlDoc");
+        return NULL;
+    }
+    c_doc = (xmlDoc*) PyCapsule_GetPointer(capsule, (const char*)"libxml2:xmlDoc");
+    if (unlikely_condition(!c_doc)) return NULL;
+
+    if (unlikely_condition(c_doc->type != XML_DOCUMENT_NODE && c_doc->type != XML_HTML_DOCUMENT_NODE)) {
+        PyErr_Format(
+            PyExc_ValueError,
+            "Illegal document provided: expected XML or HTML, found %d", (int)c_doc->type);
+        return NULL;
+    }
+
+    context = PyCapsule_GetContext(capsule);
+    if (unlikely_condition(!context && PyErr_Occurred())) return NULL;
+    if (context && strcmp((const char*) context, "destructor:xmlFreeDoc") == 0) {
+        /* take ownership by setting destructor to NULL */
+        if (PyCapsule_SetDestructor(capsule, NULL) == 0) {
+            /* ownership transferred => invalidate capsule by clearing its name */
+            if (unlikely_condition(PyCapsule_SetName(capsule, NULL))) {
+                /* this should never happen since everything above succeeded */
+                xmlFreeDoc(c_doc);
+                return NULL;
+            }
+            *is_owned = 1;
+        }
+    }
+    return c_doc;
+}
+
+/* Macro pair implementation of a depth first tree walker
+ *
+ * Calls the code block between the BEGIN and END macros for all elements
+ * below c_tree_top (exclusively), starting at c_node (inclusively iff
+ * 'inclusive' is 1).  The _ELEMENT_ variants will only stop on nodes
+ * that match _isElement(), the normal variant will stop on every node
+ * except text nodes.
+ *
+ * To traverse the node and all of its children and siblings in Pyrex, call
+ *    cdef xmlNode* some_node
+ *    BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 1)
+ *    # do something with some_node
+ *    END_FOR_EACH_ELEMENT_FROM(some_node)
+ *
+ * To traverse only the children and siblings of a node, call
+ *    cdef xmlNode* some_node
+ *    BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 0)
+ *    # do something with some_node
+ *    END_FOR_EACH_ELEMENT_FROM(some_node)
+ *
+ * To traverse only the children, do:
+ *    cdef xmlNode* some_node
+ *    some_node = parent_node.children
+ *    BEGIN_FOR_EACH_ELEMENT_FROM(parent_node, some_node, 1)
+ *    # do something with some_node
+ *    END_FOR_EACH_ELEMENT_FROM(some_node)
+ *
+ * NOTE: 'some_node' MUST be a plain 'xmlNode*' !
+ *
+ * NOTE: parent modification during the walk can divert the iterator, but
+ *       should not segfault !
+ */
+
+#define _LX__ELEMENT_MATCH(c_node, only_elements)  \
+    ((only_elements) ? (_isElement(c_node)) : 1)
+
+#define _LX__ADVANCE_TO_NEXT(c_node, only_elements)                        \
+    while ((c_node != 0) && (!_LX__ELEMENT_MATCH(c_node, only_elements)))  \
+        c_node = c_node->next;
+
+#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements)   \
+{                                                                   \
+    /* walk through children first */                               \
+    xmlNode* _lx__next = c_node->children;		            \
+    if (_lx__next != 0) {                                           \
+        if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \
+            _lx__next = 0;                                          \
+        } else {                                                    \
+            _LX__ADVANCE_TO_NEXT(_lx__next, only_elements)	    \
+        }                                                           \
+    }							            \
+    if ((_lx__next == 0) && (c_node != c_stop_node)) {              \
+        /* try siblings */                                          \
+        _lx__next = c_node->next;                                   \
+        _LX__ADVANCE_TO_NEXT(_lx__next, only_elements)              \
+        /* back off through parents */                              \
+        while (_lx__next == 0) {                                    \
+            c_node = c_node->parent;                                \
+            if (c_node == 0)                                        \
+                break;                                              \
+            if (c_node == c_stop_node)                              \
+                break;                                              \
+            if ((only_elements) && !_isElement(c_node))	            \
+                break;                                              \
+            /* we already traversed the parents -> siblings */      \
+            _lx__next = c_node->next;                               \
+            _LX__ADVANCE_TO_NEXT(_lx__next, only_elements)	    \
+        }                                                           \
+    }                                                               \
+    c_node = _lx__next;                                             \
+}
+
+#define _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, only_elements)     \
+{									      \
+    if (c_node != 0) {							      \
+        const xmlNode* _lx__tree_top = (c_tree_top);                          \
+        const int _lx__only_elements = (only_elements);                       \
+        /* make sure we start at an element */                   	      \
+        if (!_LX__ELEMENT_MATCH(c_node, _lx__only_elements)) {		      \
+            /* we skip the node, so 'inclusive' is irrelevant */              \
+            if (c_node == _lx__tree_top)                                      \
+                c_node = 0; /* nothing to traverse */                         \
+            else {                                                            \
+                c_node = c_node->next;                                        \
+                _LX__ADVANCE_TO_NEXT(c_node, _lx__only_elements)              \
+            }                                                                 \
+        } else if (! (inclusive)) {                                           \
+            /* skip the first node */                                         \
+            _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements)  \
+        }                                                                     \
+                                                                              \
+        /* now run the user code on the elements we find */                   \
+        while (c_node != 0) {                                                 \
+            /* here goes the code to be run for each element */
+
+#define _LX__END_FOR_EACH_FROM(c_node)                                        \
+            _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements)  \
+        }                                                                     \
+    }                                                                         \
+}
+
+
+#define BEGIN_FOR_EACH_ELEMENT_FROM(c_tree_top, c_node, inclusive)   \
+    _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 1)
+
+#define END_FOR_EACH_ELEMENT_FROM(c_node)   \
+    _LX__END_FOR_EACH_FROM(c_node)
+
+#define BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive)   \
+    _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 0)
+
+#define END_FOR_EACH_FROM(c_node)   \
+    _LX__END_FOR_EACH_FROM(c_node)
+
+
+#endif /* HAS_ETREE_DEFS_H */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etreepublic.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etreepublic.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..7ef001b17b765de2a94172412681e6029e931c54
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/etreepublic.pxd
@@ -0,0 +1,237 @@
+# public Cython/C interface to lxml.etree
+
+from lxml.includes cimport tree
+from lxml.includes.tree cimport const_xmlChar
+
+cdef extern from "lxml-version.h":
+    cdef char* LXML_VERSION_STRING
+
+cdef extern from "etree_defs.h":
+    # test if c_node is considered an Element (i.e. Element, Comment, etc.)
+    cdef bint _isElement(tree.xmlNode* c_node) noexcept nogil
+
+    # return the namespace URI of the node or NULL
+    cdef const_xmlChar* _getNs(tree.xmlNode* node) noexcept nogil
+
+    # pair of C macros (declared as functions here) that open and close a 'while' loop over the tree; always use both together
+    cdef void BEGIN_FOR_EACH_ELEMENT_FROM(tree.xmlNode* tree_top,
+                                          tree.xmlNode* start_node,
+                                          int start_node_inclusive) noexcept nogil  # 0 => the start node itself is skipped
+    cdef void END_FOR_EACH_ELEMENT_FROM(tree.xmlNode* start_node) noexcept nogil  # advances to the next node and closes the loop
+
+cdef extern from "etree_api.h":
+
+    # must be called first, before any other function declared here (returns -1 with an exception set on failure)
+    cdef int import_lxml__etree() except -1
+
+    ##########################################################################
+    # public ElementTree API classes
+
+    cdef class lxml.etree._Document [ object LxmlDocument ]:
+        cdef tree.xmlDoc* _c_doc
+
+    cdef class lxml.etree._Element [ object LxmlElement ]:
+        cdef _Document _doc
+        cdef tree.xmlNode* _c_node
+
+    cdef class lxml.etree.ElementBase(_Element) [ object LxmlElementBase ]:
+        pass
+
+    cdef class lxml.etree._ElementTree [ object LxmlElementTree ]:
+        cdef _Document _doc
+        cdef _Element  _context_node
+
+    cdef class lxml.etree.ElementClassLookup [ object LxmlElementClassLookup ]:
+        cdef object (*_lookup_function)(object, _Document, tree.xmlNode*)
+
+    cdef class lxml.etree.FallbackElementClassLookup(ElementClassLookup) \
+             [ object LxmlFallbackElementClassLookup ]:
+        cdef ElementClassLookup fallback
+        cdef object (*_fallback_function)(object, _Document, tree.xmlNode*)
+
+    ##########################################################################
+    # creating Element objects
+
+    # create an Element for a C-node in the Document
+    cdef _Element elementFactory(_Document doc, tree.xmlNode* c_node)
+
+    # create an ElementTree for an Element
+    cdef _ElementTree elementTreeFactory(_Element context_node)
+
+    # create an ElementTree subclass for an Element
+    cdef _ElementTree newElementTree(_Element context_node, object subclass)
+
+    # create an ElementTree from an external document
+    cdef _ElementTree adoptExternalDocument(tree.xmlDoc* c_doc, parser, bint is_owned)
+
+    # create a new Element for an existing or new document (doc = None)
+    # builds Python object after setting text, tail, namespaces and attributes
+    cdef _Element makeElement(tag, _Document doc, parser,
+                              text, tail, attrib, nsmap)
+
+    # create a new SubElement for an existing parent
+    # builds Python object after setting text, tail, namespaces and attributes
+    cdef _Element makeSubElement(_Element parent, tag, text, tail,
+                                 attrib, nsmap)
+
+    # deep copy a node to include it in the Document
+    cdef _Element deepcopyNodeToDocument(_Document doc, tree.xmlNode* c_root)
+
+    # set the internal lookup function for Element/Comment/PI classes
+    # use setElementClassLookupFunction(NULL, None) to reset it
+    # note that the lookup function *must always* return an _Element subclass!
+    cdef void setElementClassLookupFunction(
+         object (*function)(object, _Document, tree.xmlNode*), object state)
+
+    # lookup function that always returns the default Element class
+    # note that the first argument is expected to be None!
+    cdef object lookupDefaultElementClass(_1, _Document _2,
+                                          tree.xmlNode* c_node)
+
+    # lookup function for namespace/tag specific Element classes
+    # note that the first argument is expected to be None!
+    cdef object lookupNamespaceElementClass(_1, _Document _2,
+                                            tree.xmlNode* c_node)
+
+    # call the fallback lookup function of a FallbackElementClassLookup
+    cdef object callLookupFallback(FallbackElementClassLookup lookup,
+                                   _Document doc, tree.xmlNode* c_node)
+
+    ##########################################################################
+    # XML attribute access
+
+    # return an attribute value for a C attribute on a C element node
+    cdef unicode attributeValue(tree.xmlNode* c_element,
+                                tree.xmlAttr* c_attrib_node)
+
+    # return the value of the attribute with 'ns' and 'name' (or None)
+    cdef unicode attributeValueFromNsName(tree.xmlNode* c_element,
+                                          const_xmlChar* c_ns, const_xmlChar* c_name)
+
+    # return the value of attribute "{ns}name", or the default value
+    cdef object getAttributeValue(_Element element, key, default)
+
+    # return an iterator over attribute names (keysvalues=1), values (keysvalues=2) or items (keysvalues=3)
+    # attributes must not be removed during iteration!
+    cdef object iterattributes(_Element element, int keysvalues)
+
+    # return the list of all attribute names (1), values (2) or items (3)
+    cdef list collectAttributes(tree.xmlNode* c_element, int keysvalues)
+
+    # set an attribute value on an element
+    # on failure, sets an exception and returns -1
+    cdef int setAttributeValue(_Element element, key, value) except -1
+
+    # delete an attribute
+    # on failure, sets an exception and returns -1
+    cdef int delAttribute(_Element element, key) except -1
+
+    # delete an attribute based on name and namespace URI
+    # returns -1 if the attribute was not found (no exception)
+    cdef int delAttributeFromNsName(tree.xmlNode* c_element,
+                                    const_xmlChar* c_href, const_xmlChar* c_name) noexcept
+
+    ##########################################################################
+    # XML node helper functions
+
+    # check if the element has at least one child
+    cdef bint hasChild(tree.xmlNode* c_node) noexcept nogil
+
+    # find child element number 'index' (supports negative indexes)
+    cdef tree.xmlNode* findChild(tree.xmlNode* c_node,
+                                 Py_ssize_t index) noexcept nogil
+
+    # find child element number 'index' starting at first one
+    cdef tree.xmlNode* findChildForwards(tree.xmlNode* c_node,
+                                         Py_ssize_t index) nogil
+
+    # find child element number 'index' starting at last one
+    cdef tree.xmlNode* findChildBackwards(tree.xmlNode* c_node,
+                                          Py_ssize_t index) nogil
+
+    # return next/previous sibling element of the node
+    cdef tree.xmlNode* nextElement(tree.xmlNode* c_node) nogil
+    cdef tree.xmlNode* previousElement(tree.xmlNode* c_node) nogil
+
+    ##########################################################################
+    # iterators (DEPRECATED API, don't use in new code!)
+
+    cdef class lxml.etree._ElementTagMatcher [ object LxmlElementTagMatcher ]:
+        cdef char* _href
+        cdef char* _name
+
+    # store "{ns}tag" (or None) filter for this matcher or element iterator
+    # ** unless _href *and* _name are set up 'by hand', this function *must*
+    # ** be called when subclassing the iterator below!
+    cdef void initTagMatch(_ElementTagMatcher matcher, tag)
+
+    cdef class lxml.etree._ElementIterator(_ElementTagMatcher) [
+        object LxmlElementIterator ]:
+        cdef _Element _node
+        cdef tree.xmlNode* (*_next_element)(tree.xmlNode*)
+
+    # store the initial node of the iterator if it matches the required tag
+    # or its next matching sibling if not
+    cdef void iteratorStoreNext(_ElementIterator iterator, _Element node)
+
+    ##########################################################################
+    # other helper functions
+
+    # check if a C node matches a tag name and namespace
+    # (NULL allowed for each => always matches)
+    cdef int tagMatches(tree.xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name)
+
+    # convert a UTF-8 char* to a Python unicode string
+    cdef unicode pyunicode(const_xmlChar* s)
+
+    # convert the string to UTF-8 using the normal lxml.etree semantics
+    cdef bytes utf8(object s)
+
+    # split a tag into a (URI, name) tuple, return None as URI for '{}tag'
+    cdef tuple getNsTag(object tag)
+
+    # split a tag into a (URI, name) tuple, return b'' as URI for '{}tag'
+    cdef tuple getNsTagWithEmptyNs(object tag)
+
+    # get the "{ns}tag" string for a C node
+    cdef unicode namespacedName(tree.xmlNode* c_node)
+
+    # get the "{ns}tag" string for a href/tagname pair (c_ns may be NULL)
+    cdef unicode namespacedNameFromNsName(const_xmlChar* c_ns, const_xmlChar* c_tag)
+
+    # check if the node has a text value (which may be '')
+    cdef bint hasText(tree.xmlNode* c_node) nogil
+
+    # check if the node has a tail value (which may be '')
+    cdef bint hasTail(tree.xmlNode* c_node) nogil
+
+    # get the text content of an element (or None)
+    cdef unicode textOf(tree.xmlNode* c_node)
+
+    # get the tail content of an element (or None)
+    cdef unicode tailOf(tree.xmlNode* c_node)
+
+    # set the text value of an element
+    cdef int setNodeText(tree.xmlNode* c_node, text) except -1
+
+    # set the tail text value of an element
+    cdef int setTailText(tree.xmlNode* c_node, text) except -1
+
+    # append an element to the children of a parent element
+    # deprecated: don't use, does not propagate exceptions!
+    # use appendChildToElement() instead
+    cdef void appendChild(_Element parent, _Element child)
+
+    # added in lxml 3.3 as a safe replacement for appendChild()
+    # return -1 for exception, 0 for ok
+    cdef int appendChildToElement(_Element parent, _Element child) except -1
+
+    # recursively lookup a namespace in element or ancestors, or create it
+    cdef tree.xmlNs* findOrBuildNodeNsPrefix(
+        _Document doc, tree.xmlNode* c_node, const_xmlChar* href, const_xmlChar* prefix)
+
+    # find the Document of an Element, ElementTree or Document (itself!)
+    cdef _Document documentOrRaise(object input)
+
+    # find the root Element of an Element (itself!), ElementTree or Document
+    cdef _Element rootNodeOrRaise(object input)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b90e0754aa815daecf3b228e11375a0a7e9ea876
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/libcharset.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/libcharset.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcf22748101279e454fd2fefe01908fd8545bce2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/libcharset.h
@@ -0,0 +1,45 @@
+/* Copyright (C) 2003 Free Software Foundation, Inc.
+   This file is part of the GNU CHARSET Library.
+
+   The GNU CHARSET Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2 of the
+   License, or (at your option) any later version.
+
+   The GNU CHARSET Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU CHARSET Library; see the file COPYING.LIB.  If not,
+   see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _LIBCHARSET_H
+#define _LIBCHARSET_H
+
+#include <localcharset.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Support for relocatable packages.  */
+
+/* Sets the original and the current installation prefix of the package.
+   Relocation simply replaces a pathname starting with the original prefix
+   by the corresponding pathname with the current prefix instead.  Both
+   prefixes should be directory names without trailing slash (i.e. use ""
+   instead of "/").  */
+extern void libcharset_set_relocation_prefix (const char *orig_prefix,
+                                              const char *curr_prefix);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _LIBCHARSET_H */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/localcharset.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/localcharset.h
new file mode 100644
index 0000000000000000000000000000000000000000..34ce0adde9bb793f1c1cd5f81b5cc3d2eff08ab1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/localcharset.h
@@ -0,0 +1,137 @@
+/* Determine a canonical name for the current locale's character encoding.
+   Copyright (C) 2000-2003, 2009-2019 Free Software Foundation, Inc.
+   This file is part of the GNU CHARSET Library.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program; if not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef _LOCALCHARSET_H
+#define _LOCALCHARSET_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Determine the current locale's character encoding, and canonicalize it
+   into one of the canonical names listed below.
+   The result must not be freed; it is statically allocated.  The result
+   becomes invalid when setlocale() is used to change the global locale, or
+   when the value of one of the environment variables LC_ALL, LC_CTYPE, LANG
+   is changed; threads in multithreaded programs should not do this.
+   If the canonical name cannot be determined, the result is a non-canonical
+   name.  */
+extern const char * locale_charset (void);
+
+/* About GNU canonical names for character encodings:
+
+   Every canonical name must be supported by GNU libiconv.  Support by GNU libc
+   is also desirable.
+
+   The name is case insensitive.  Usually an upper case MIME charset name is
+   preferred.
+
+   The current list of these GNU canonical names is:
+
+       name              MIME?             used by which systems
+                                    (darwin = Mac OS X, windows = native Windows)
+
+   ASCII, ANSI_X3.4-1968       glibc solaris freebsd netbsd darwin minix cygwin
+   ISO-8859-1              Y   glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos
+   ISO-8859-2              Y   glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos
+   ISO-8859-3              Y   glibc solaris cygwin
+   ISO-8859-4              Y   hpux osf solaris freebsd netbsd openbsd darwin
+   ISO-8859-5              Y   glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos
+   ISO-8859-6              Y   glibc aix hpux solaris cygwin
+   ISO-8859-7              Y   glibc aix hpux irix osf solaris freebsd netbsd openbsd darwin cygwin zos
+   ISO-8859-8              Y   glibc aix hpux osf solaris cygwin zos
+   ISO-8859-9              Y   glibc aix hpux irix osf solaris freebsd darwin cygwin zos
+   ISO-8859-13                 glibc hpux solaris freebsd netbsd openbsd darwin cygwin
+   ISO-8859-14                 glibc cygwin
+   ISO-8859-15                 glibc aix irix osf solaris freebsd netbsd openbsd darwin cygwin
+   KOI8-R                  Y   glibc hpux solaris freebsd netbsd openbsd darwin
+   KOI8-U                  Y   glibc freebsd netbsd openbsd darwin cygwin
+   KOI8-T                      glibc
+   CP437                       dos
+   CP775                       dos
+   CP850                       aix osf dos
+   CP852                       dos
+   CP855                       dos
+   CP856                       aix
+   CP857                       dos
+   CP861                       dos
+   CP862                       dos
+   CP864                       dos
+   CP865                       dos
+   CP866                       freebsd netbsd openbsd darwin dos
+   CP869                       dos
+   CP874                       windows dos
+   CP922                       aix
+   CP932                       aix cygwin windows dos
+   CP943                       aix zos
+   CP949                       osf darwin windows dos
+   CP950                       windows dos
+   CP1046                      aix
+   CP1124                      aix
+   CP1125                      dos
+   CP1129                      aix
+   CP1131                      freebsd darwin
+   CP1250                      windows
+   CP1251                      glibc hpux solaris freebsd netbsd openbsd darwin cygwin windows
+   CP1252                      aix windows
+   CP1253                      windows
+   CP1254                      windows
+   CP1255                      glibc windows
+   CP1256                      windows
+   CP1257                      windows
+   GB2312                  Y   glibc aix hpux irix solaris freebsd netbsd darwin cygwin zos
+   EUC-JP                  Y   glibc aix hpux irix osf solaris freebsd netbsd darwin cygwin
+   EUC-KR                  Y   glibc aix hpux irix osf solaris freebsd netbsd darwin cygwin zos
+   EUC-TW                      glibc aix hpux irix osf solaris netbsd
+   BIG5                    Y   glibc aix hpux osf solaris freebsd netbsd darwin cygwin zos
+   BIG5-HKSCS                  glibc hpux solaris netbsd darwin
+   GBK                         glibc aix osf solaris freebsd darwin cygwin windows dos
+   GB18030                     glibc hpux solaris freebsd netbsd darwin
+   SHIFT_JIS               Y   hpux osf solaris freebsd netbsd darwin
+   JOHAB                       glibc solaris windows
+   TIS-620                     glibc aix hpux osf solaris cygwin zos
+   VISCII                  Y   glibc
+   TCVN5712-1                  glibc
+   ARMSCII-8                   glibc freebsd netbsd darwin
+   GEORGIAN-PS                 glibc cygwin
+   PT154                       glibc netbsd cygwin
+   HP-ROMAN8                   hpux
+   HP-ARABIC8                  hpux
+   HP-GREEK8                   hpux
+   HP-HEBREW8                  hpux
+   HP-TURKISH8                 hpux
+   HP-KANA8                    hpux
+   DEC-KANJI                   osf
+   DEC-HANYU                   osf
+   UTF-8                   Y   glibc aix hpux osf solaris netbsd darwin cygwin zos
+
+   Note: Names which are not marked as being a MIME name should not be used in
+   Internet protocols for information interchange (mail, news, etc.).
+
+   Note: ASCII and ANSI_X3.4-1968 are synonymous canonical names.  Applications
+   must understand both names and treat them as equivalent.
+ */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _LOCALCHARSET_H */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zconf.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zconf.h
new file mode 100644
index 0000000000000000000000000000000000000000..ede3c82e3eb129528194a2045c808b418fb20296
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zconf.h
@@ -0,0 +1,543 @@
+/* zconf.h -- configuration of the zlib compression library
+ * Copyright (C) 1995-2024 Jean-loup Gailly, Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* @(#) $Id$ */
+
+#ifndef ZCONF_H
+#define ZCONF_H
+
+/*
+ * If you *really* need a unique prefix for all types and library functions,
+ * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ * Even better than compiling with -DZ_PREFIX would be to use configure to set
+ * this permanently in zconf.h using "./configure --zprefix".
+ */
+#ifdef Z_PREFIX     /* may be set to #if 1 by ./configure */
+#  define Z_PREFIX_SET
+
+/* all linked symbols and init macros */
+#  define _dist_code            z__dist_code
+#  define _length_code          z__length_code
+#  define _tr_align             z__tr_align
+#  define _tr_flush_bits        z__tr_flush_bits
+#  define _tr_flush_block       z__tr_flush_block
+#  define _tr_init              z__tr_init
+#  define _tr_stored_block      z__tr_stored_block
+#  define _tr_tally             z__tr_tally
+#  define adler32               z_adler32
+#  define adler32_combine       z_adler32_combine
+#  define adler32_combine64     z_adler32_combine64
+#  define adler32_z             z_adler32_z
+#  ifndef Z_SOLO
+#    define compress              z_compress
+#    define compress2             z_compress2
+#    define compressBound         z_compressBound
+#  endif
+#  define crc32                 z_crc32
+#  define crc32_combine         z_crc32_combine
+#  define crc32_combine64       z_crc32_combine64
+#  define crc32_combine_gen     z_crc32_combine_gen
+#  define crc32_combine_gen64   z_crc32_combine_gen64
+#  define crc32_combine_op      z_crc32_combine_op
+#  define crc32_z               z_crc32_z
+#  define deflate               z_deflate
+#  define deflateBound          z_deflateBound
+#  define deflateCopy           z_deflateCopy
+#  define deflateEnd            z_deflateEnd
+#  define deflateGetDictionary  z_deflateGetDictionary
+#  define deflateInit           z_deflateInit
+#  define deflateInit2          z_deflateInit2
+#  define deflateInit2_         z_deflateInit2_
+#  define deflateInit_          z_deflateInit_
+#  define deflateParams         z_deflateParams
+#  define deflatePending        z_deflatePending
+#  define deflatePrime          z_deflatePrime
+#  define deflateReset          z_deflateReset
+#  define deflateResetKeep      z_deflateResetKeep
+#  define deflateSetDictionary  z_deflateSetDictionary
+#  define deflateSetHeader      z_deflateSetHeader
+#  define deflateTune           z_deflateTune
+#  define deflate_copyright     z_deflate_copyright
+#  define get_crc_table         z_get_crc_table
+#  ifndef Z_SOLO
+#    define gz_error              z_gz_error
+#    define gz_intmax             z_gz_intmax
+#    define gz_strwinerror        z_gz_strwinerror
+#    define gzbuffer              z_gzbuffer
+#    define gzclearerr            z_gzclearerr
+#    define gzclose               z_gzclose
+#    define gzclose_r             z_gzclose_r
+#    define gzclose_w             z_gzclose_w
+#    define gzdirect              z_gzdirect
+#    define gzdopen               z_gzdopen
+#    define gzeof                 z_gzeof
+#    define gzerror               z_gzerror
+#    define gzflush               z_gzflush
+#    define gzfread               z_gzfread
+#    define gzfwrite              z_gzfwrite
+#    define gzgetc                z_gzgetc
+#    define gzgetc_               z_gzgetc_
+#    define gzgets                z_gzgets
+#    define gzoffset              z_gzoffset
+#    define gzoffset64            z_gzoffset64
+#    define gzopen                z_gzopen
+#    define gzopen64              z_gzopen64
+#    ifdef _WIN32
+#      define gzopen_w              z_gzopen_w
+#    endif
+#    define gzprintf              z_gzprintf
+#    define gzputc                z_gzputc
+#    define gzputs                z_gzputs
+#    define gzread                z_gzread
+#    define gzrewind              z_gzrewind
+#    define gzseek                z_gzseek
+#    define gzseek64              z_gzseek64
+#    define gzsetparams           z_gzsetparams
+#    define gztell                z_gztell
+#    define gztell64              z_gztell64
+#    define gzungetc              z_gzungetc
+#    define gzvprintf             z_gzvprintf
+#    define gzwrite               z_gzwrite
+#  endif
+#  define inflate               z_inflate
+#  define inflateBack           z_inflateBack
+#  define inflateBackEnd        z_inflateBackEnd
+#  define inflateBackInit       z_inflateBackInit
+#  define inflateBackInit_      z_inflateBackInit_
+#  define inflateCodesUsed      z_inflateCodesUsed
+#  define inflateCopy           z_inflateCopy
+#  define inflateEnd            z_inflateEnd
+#  define inflateGetDictionary  z_inflateGetDictionary
+#  define inflateGetHeader      z_inflateGetHeader
+#  define inflateInit           z_inflateInit
+#  define inflateInit2          z_inflateInit2
+#  define inflateInit2_         z_inflateInit2_
+#  define inflateInit_          z_inflateInit_
+#  define inflateMark           z_inflateMark
+#  define inflatePrime          z_inflatePrime
+#  define inflateReset          z_inflateReset
+#  define inflateReset2         z_inflateReset2
+#  define inflateResetKeep      z_inflateResetKeep
+#  define inflateSetDictionary  z_inflateSetDictionary
+#  define inflateSync           z_inflateSync
+#  define inflateSyncPoint      z_inflateSyncPoint
+#  define inflateUndermine      z_inflateUndermine
+#  define inflateValidate       z_inflateValidate
+#  define inflate_copyright     z_inflate_copyright
+#  define inflate_fast          z_inflate_fast
+#  define inflate_table         z_inflate_table
+#  ifndef Z_SOLO
+#    define uncompress            z_uncompress
+#    define uncompress2           z_uncompress2
+#  endif
+#  define zError                z_zError
+#  ifndef Z_SOLO
+#    define zcalloc               z_zcalloc
+#    define zcfree                z_zcfree
+#  endif
+#  define zlibCompileFlags      z_zlibCompileFlags
+#  define zlibVersion           z_zlibVersion
+
+/* all zlib typedefs in zlib.h and zconf.h */
+#  define Byte                  z_Byte
+#  define Bytef                 z_Bytef
+#  define alloc_func            z_alloc_func
+#  define charf                 z_charf
+#  define free_func             z_free_func
+#  ifndef Z_SOLO
+#    define gzFile                z_gzFile
+#  endif
+#  define gz_header             z_gz_header
+#  define gz_headerp            z_gz_headerp
+#  define in_func               z_in_func
+#  define intf                  z_intf
+#  define out_func              z_out_func
+#  define uInt                  z_uInt
+#  define uIntf                 z_uIntf
+#  define uLong                 z_uLong
+#  define uLongf                z_uLongf
+#  define voidp                 z_voidp
+#  define voidpc                z_voidpc
+#  define voidpf                z_voidpf
+
+/* all zlib structs in zlib.h and zconf.h */
+#  define gz_header_s           z_gz_header_s
+#  define internal_state        z_internal_state
+
+#endif
+
+#if defined(__MSDOS__) && !defined(MSDOS)
+#  define MSDOS
+#endif
+#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2)
+#  define OS2
+#endif
+#if defined(_WINDOWS) && !defined(WINDOWS)
+#  define WINDOWS
+#endif
+#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__)
+#  ifndef WIN32
+#    define WIN32
+#  endif
+#endif
+#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32)
+#  if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__)
+#    ifndef SYS16BIT
+#      define SYS16BIT
+#    endif
+#  endif
+#endif
+
+/*
+ * Compile with -DMAXSEG_64K if the alloc function cannot allocate more
+ * than 64k bytes at a time (needed on systems with 16-bit int).
+ */
+#ifdef SYS16BIT
+#  define MAXSEG_64K
+#endif
+#ifdef MSDOS
+#  define UNALIGNED_OK
+#endif
+
+#ifdef __STDC_VERSION__
+#  ifndef STDC
+#    define STDC
+#  endif
+#  if __STDC_VERSION__ >= 199901L
+#    ifndef STDC99
+#      define STDC99
+#    endif
+#  endif
+#endif
+#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32))
+#  define STDC
+#endif
+#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__))
+#  define STDC
+#endif
+
+#if defined(__OS400__) && !defined(STDC)    /* iSeries (formerly AS/400). */
+#  define STDC
+#endif
+
+#ifndef STDC
+#  ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */
+#    define const       /* note: need a more gentle solution here */
+#  endif
+#endif
+
+#if defined(ZLIB_CONST) && !defined(z_const)
+#  define z_const const
+#else
+#  define z_const
+#endif
+
+#ifdef Z_SOLO
+#  ifdef _WIN64
+     typedef unsigned long long z_size_t;
+#  else
+     typedef unsigned long z_size_t;
+#  endif
+#else
+#  define z_longlong long long
+#  if defined(NO_SIZE_T)
+     typedef unsigned NO_SIZE_T z_size_t;
+#  elif defined(STDC)
+#    include <stddef.h>
+     typedef size_t z_size_t;
+#  else
+     typedef unsigned long z_size_t;
+#  endif
+#  undef z_longlong
+#endif
+
+/* Maximum value for memLevel in deflateInit2 */
+#ifndef MAX_MEM_LEVEL
+#  ifdef MAXSEG_64K
+#    define MAX_MEM_LEVEL 8
+#  else
+#    define MAX_MEM_LEVEL 9
+#  endif
+#endif
+
+/* Maximum value for windowBits in deflateInit2 and inflateInit2.
+ * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files
+ * created by gzip. (Files created by minigzip can still be extracted by
+ * gzip.)
+ */
+#ifndef MAX_WBITS
+#  define MAX_WBITS   15 /* 32K LZ77 window */
+#endif
+
+/* The memory requirements for deflate are (in bytes):
+            (1 << (windowBits+2)) +  (1 << (memLevel+9))
+ that is: 128K for windowBits=15  +  128K for memLevel = 8  (default values)
+ plus a few kilobytes for small objects. For example, if you want to reduce
+ the default memory requirements from 256K to 128K, compile with
+     make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7"
+ Of course this will generally degrade compression (there's no free lunch).
+
+   The memory requirements for inflate are (in bytes) 1 << windowBits
+ that is, 32K for windowBits=15 (default value) plus about 7 kilobytes
+ for small objects.
+*/
+
+                        /* Type declarations */
+
+#ifndef OF /* function prototypes */
+#  ifdef STDC
+#    define OF(args)  args
+#  else
+#    define OF(args)  ()
+#  endif
+#endif
+
+/* The following definitions for FAR are needed only for MSDOS mixed
+ * model programming (small or medium model with some far allocations).
+ * This was tested only with MSC; for other MSDOS compilers you may have
+ * to define NO_MEMCPY in zutil.h.  If you don't need the mixed model,
+ * just define FAR to be empty.
+ */
+#ifdef SYS16BIT
+#  if defined(M_I86SM) || defined(M_I86MM)
+     /* MSC small or medium model */
+#    define SMALL_MEDIUM
+#    ifdef _MSC_VER
+#      define FAR _far
+#    else
+#      define FAR far
+#    endif
+#  endif
+#  if (defined(__SMALL__) || defined(__MEDIUM__))
+     /* Turbo C small or medium model */
+#    define SMALL_MEDIUM
+#    ifdef __BORLANDC__
+#      define FAR _far
+#    else
+#      define FAR far
+#    endif
+#  endif
+#endif
+
+#if defined(WINDOWS) || defined(WIN32)
+   /* If building or using zlib as a DLL, define ZLIB_DLL.
+    * This is not mandatory, but it offers a little performance increase.
+    */
+#  ifdef ZLIB_DLL
+#    if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500))
+#      ifdef ZLIB_INTERNAL
+#        define ZEXTERN extern __declspec(dllexport)
+#      else
+#        define ZEXTERN extern __declspec(dllimport)
+#      endif
+#    endif
+#  endif  /* ZLIB_DLL */
+   /* If building or using zlib with the WINAPI/WINAPIV calling convention,
+    * define ZLIB_WINAPI.
+    * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI.
+    */
+#  ifdef ZLIB_WINAPI
+#    ifdef FAR
+#      undef FAR
+#    endif
+#    ifndef WIN32_LEAN_AND_MEAN
+#      define WIN32_LEAN_AND_MEAN
+#    endif
+#    include <windows.h>
+     /* No need for _export, use ZLIB.DEF instead. */
+     /* For complete Windows compatibility, use WINAPI, not __stdcall. */
+#    define ZEXPORT WINAPI
+#    ifdef WIN32
+#      define ZEXPORTVA WINAPIV
+#    else
+#      define ZEXPORTVA FAR CDECL
+#    endif
+#  endif
+#endif
+
+#if defined (__BEOS__)
+#  ifdef ZLIB_DLL
+#    ifdef ZLIB_INTERNAL
+#      define ZEXPORT   __declspec(dllexport)
+#      define ZEXPORTVA __declspec(dllexport)
+#    else
+#      define ZEXPORT   __declspec(dllimport)
+#      define ZEXPORTVA __declspec(dllimport)
+#    endif
+#  endif
+#endif
+
+#ifndef ZEXTERN
+#  define ZEXTERN extern
+#endif
+#ifndef ZEXPORT
+#  define ZEXPORT
+#endif
+#ifndef ZEXPORTVA
+#  define ZEXPORTVA
+#endif
+
+#ifndef FAR
+#  define FAR
+#endif
+
+#if !defined(__MACTYPES__)
+typedef unsigned char  Byte;  /* 8 bits */
+#endif
+typedef unsigned int   uInt;  /* 16 bits or more */
+typedef unsigned long  uLong; /* 32 bits or more */
+
+#ifdef SMALL_MEDIUM
+   /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */
+#  define Bytef Byte FAR
+#else
+   typedef Byte  FAR Bytef;
+#endif
+typedef char  FAR charf;
+typedef int   FAR intf;
+typedef uInt  FAR uIntf;
+typedef uLong FAR uLongf;
+
+#ifdef STDC
+   typedef void const *voidpc;
+   typedef void FAR   *voidpf;
+   typedef void       *voidp;
+#else
+   typedef Byte const *voidpc;
+   typedef Byte FAR   *voidpf;
+   typedef Byte       *voidp;
+#endif
+
+#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC)
+#  include <limits.h>           /* for UINT_MAX, ULONG_MAX, USHRT_MAX */
+#  if (UINT_MAX == 0xffffffffUL)
+#    define Z_U4 unsigned
+#  elif (ULONG_MAX == 0xffffffffUL)
+#    define Z_U4 unsigned long
+#  elif (USHRT_MAX == 0xffffffffUL)
+#    define Z_U4 unsigned short
+#  endif                        /* no match: Z_U4 stays undefined */
+#endif
+
+#ifdef Z_U4
+   typedef Z_U4 z_crc_t;
+#else
+   typedef unsigned long z_crc_t;   /* fallback when Z_U4 is not defined */
+#endif
+
+#if 1    /* was set to #if 1 by ./configure */
+#  define Z_HAVE_UNISTD_H
+#endif
+
+#if 1    /* was set to #if 1 by ./configure */
+#  define Z_HAVE_STDARG_H
+#endif
+
+#ifdef STDC
+#  ifndef Z_SOLO
+#    include <sys/types.h>      /* for off_t */
+#  endif
+#endif
+
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#  ifndef Z_SOLO
+#    include <stdarg.h>         /* for va_list */
+#  endif
+#endif
+
+#ifdef _WIN32
+#  ifndef Z_SOLO
+#    include <stddef.h>         /* for wchar_t */
+#  endif
+#endif
+
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+#  undef _LARGEFILE64_SOURCE    /* taken only when the macro expands to 0 */
+#endif
+
+#ifndef Z_HAVE_UNISTD_H
+#  ifdef __WATCOMC__
+#    define Z_HAVE_UNISTD_H
+#  endif
+#endif
+#ifndef Z_HAVE_UNISTD_H
+#  if defined(_LARGEFILE64_SOURCE) && !defined(_WIN32)
+#    define Z_HAVE_UNISTD_H
+#  endif
+#endif
+#ifndef Z_SOLO
+#  if defined(Z_HAVE_UNISTD_H)
+#    include <unistd.h>         /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+#    ifdef VMS
+#      include <unixio.h>       /* for off_t */
+#    endif
+#    ifndef z_off_t
+#      define z_off_t off_t
+#    endif
+#  endif
+#endif
+
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+#  define Z_LFS64
+#endif
+
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+#  define Z_LARGE64
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+#  define Z_WANT64
+#endif
+
+#if !defined(SEEK_SET) && !defined(Z_SOLO)
+#  define SEEK_SET        0       /* Seek from beginning of file.  */
+#  define SEEK_CUR        1       /* Seek from current position.  */
+#  define SEEK_END        2       /* Set file pointer to EOF plus "offset" */
+#endif
+
+#ifndef z_off_t
+#  define z_off_t long
+#endif
+
+#if !defined(_WIN32) && defined(Z_LARGE64)
+#  define z_off64_t off64_t
+#else
+#  if defined(_WIN32) && !defined(__GNUC__)
+#    define z_off64_t __int64
+#  else
+#    define z_off64_t z_off_t
+#  endif
+#endif
+
+/* MVS linker does not support external names larger than 8 bytes */
+#if defined(__MVS__)
+  #pragma map(deflateInit_,"DEIN")
+  #pragma map(deflateInit2_,"DEIN2")
+  #pragma map(deflateEnd,"DEEND")
+  #pragma map(deflateBound,"DEBND")
+  #pragma map(inflateInit_,"ININ")
+  #pragma map(inflateInit2_,"ININ2")
+  #pragma map(inflateEnd,"INEND")
+  #pragma map(inflateSync,"INSY")
+  #pragma map(inflateSetDictionary,"INSEDI")
+  #pragma map(compressBound,"CMBND")
+  #pragma map(inflate_table,"INTABL")
+  #pragma map(inflate_fast,"INFA")
+  #pragma map(inflate_copyright,"INCOPY")
+#endif
+
+#endif /* ZCONF_H */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zlib.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zlib.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d4b932eaf6a0fbb8133b3ab49ba5ef587059fa0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/extlibs/zlib.h
@@ -0,0 +1,1938 @@
+/* zlib.h -- interface of the 'zlib' general purpose compression library
+  version 1.3.1, January 22nd, 2024
+
+  Copyright (C) 1995-2024 Jean-loup Gailly and Mark Adler
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  Jean-loup Gailly        Mark Adler
+  jloup@gzip.org          madler@alumni.caltech.edu
+
+
+  The data format used by the zlib library is described by RFCs (Request for
+  Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950
+  (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format).
+*/
+
+#ifndef ZLIB_H
+#define ZLIB_H
+
+#include "zconf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZLIB_VERSION "1.3.1"
+#define ZLIB_VERNUM 0x1310
+#define ZLIB_VER_MAJOR 1
+#define ZLIB_VER_MINOR 3
+#define ZLIB_VER_REVISION 1
+#define ZLIB_VER_SUBREVISION 0
+
+/*
+    The 'zlib' compression library provides in-memory compression and
+  decompression functions, including integrity checks of the uncompressed data.
+  This version of the library supports only one compression method (deflation)
+  but other algorithms will be added later and will have the same stream
+  interface.
+
+    Compression can be done in a single step if the buffers are large enough,
+  or can be done by repeated calls of the compression function.  In the latter
+  case, the application must provide more input and/or consume the output
+  (providing more output space) before each call.
+
+    The compressed data format used by default by the in-memory functions is
+  the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
+  around a deflate stream, which is itself documented in RFC 1951.
+
+    The library also supports reading and writing files in gzip (.gz) format
+  with an interface similar to that of stdio using the functions that start
+  with "gz".  The gzip format is different from the zlib format.  gzip is a
+  gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
+
+    This library can optionally read and write gzip and raw deflate streams in
+  memory as well.
+
+    The zlib format was designed to be compact and fast for use in memory
+  and on communications channels.  The gzip format was designed for single-
+  file compression on file systems, has a larger header than zlib to maintain
+  directory information, and uses a different, slower check method than zlib.
+
+    The library does not install any signal handler.  The decoder checks
+  the consistency of the compressed data, so the library should never crash
+  even in the case of corrupted input.
+*/
+
+typedef voidpf (*alloc_func)(voidpf opaque, uInt items, uInt size);
+typedef void   (*free_func)(voidpf opaque, voidpf address);
+
+struct internal_state;
+
+typedef struct z_stream_s {
+    z_const Bytef *next_in;     /* next input byte */
+    uInt     avail_in;  /* number of bytes available at next_in */
+    uLong    total_in;  /* total number of input bytes read so far */
+
+    Bytef    *next_out; /* next output byte will go here */
+    uInt     avail_out; /* remaining free space at next_out */
+    uLong    total_out; /* total number of bytes output so far */
+
+    z_const char *msg;  /* last error message, NULL if no error */
+    struct internal_state FAR *state; /* opaque to applications */
+
+    alloc_func zalloc;  /* allocates the internal state (Z_NULL: use default) */
+    free_func  zfree;   /* frees the internal state (Z_NULL: use default) */
+    voidpf     opaque;  /* passed as first argument to zalloc and zfree */
+
+    int     data_type;  /* best guess about the data type: binary or text
+                           for deflate, or the decoding state for inflate */
+    uLong   adler;      /* Adler-32 or CRC-32 value of the uncompressed data */
+    uLong   reserved;   /* reserved for future use */
+} z_stream;
+
+typedef z_stream FAR *z_streamp;
+
+/*
+     gzip header information passed to and from zlib routines.  See RFC 1952
+  for more details on the meanings of these fields.
+*/
+typedef struct gz_header_s {
+    int     text;       /* true if compressed data believed to be text */
+    uLong   time;       /* modification time */
+    int     xflags;     /* extra flags (not used when writing a gzip file) */
+    int     os;         /* operating system */
+    Bytef   *extra;     /* pointer to extra field or Z_NULL if none */
+    uInt    extra_len;  /* extra field length (valid if extra != Z_NULL) */
+    uInt    extra_max;  /* space available at extra (only when reading header) */
+    Bytef   *name;      /* pointer to zero-terminated file name or Z_NULL */
+    uInt    name_max;   /* space available at name (only when reading header) */
+    Bytef   *comment;   /* pointer to zero-terminated comment or Z_NULL */
+    uInt    comm_max;   /* space available at comment (only when reading header) */
+    int     hcrc;       /* true if there was or will be a header CRC */
+    int     done;       /* true when done reading gzip header (not used
+                           when writing a gzip file) */
+} gz_header;
+
+typedef gz_header FAR *gz_headerp;
+
+/*
+     The application must update next_in and avail_in when avail_in has dropped
+   to zero.  It must update next_out and avail_out when avail_out has dropped
+   to zero.  The application must initialize zalloc, zfree and opaque before
+   calling the init function.  All other fields are set by the compression
+   library and must not be updated by the application.
+
+     The opaque value provided by the application will be passed as the first
+   parameter for calls of zalloc and zfree.  This can be useful for custom
+   memory management.  The compression library attaches no meaning to the
+   opaque value.
+
+     zalloc must return Z_NULL if there is not enough memory for the object.
+   If zlib is used in a multi-threaded application, zalloc and zfree must be
+   thread safe.  In that case, zlib is thread-safe.  When zalloc and zfree are
+   Z_NULL on entry to the initialization function, they are set to internal
+   routines that use the standard library functions malloc() and free().
+
+     On 16-bit systems, the functions zalloc and zfree must be able to allocate
+   exactly 65536 bytes, but will not be required to allocate more than this if
+   the symbol MAXSEG_64K is defined (see zconf.h).  WARNING: On MSDOS, pointers
+   returned by zalloc for objects of exactly 65536 bytes *must* have their
+   offset normalized to zero.  The default allocation function provided by this
+   library ensures this (see zutil.c).  To reduce memory requirements and avoid
+   any allocation of 64K objects, at the expense of compression ratio, compile
+   the library with -DMAX_WBITS=14 (see zconf.h).
+
+     The fields total_in and total_out can be used for statistics or progress
+   reports.  After compression, total_in holds the total size of the
+   uncompressed data and may be saved for use by the decompressor (particularly
+   if the decompressor wants to decompress everything in a single step).
+*/
+
+                        /* constants */
+
+#define Z_NO_FLUSH      0
+#define Z_PARTIAL_FLUSH 1
+#define Z_SYNC_FLUSH    2
+#define Z_FULL_FLUSH    3
+#define Z_FINISH        4
+#define Z_BLOCK         5
+#define Z_TREES         6
+/* Allowed flush values; see deflate() and inflate() below for details */
+
+#define Z_OK            0
+#define Z_STREAM_END    1
+#define Z_NEED_DICT     2
+#define Z_ERRNO        (-1)
+#define Z_STREAM_ERROR (-2)
+#define Z_DATA_ERROR   (-3)
+#define Z_MEM_ERROR    (-4)
+#define Z_BUF_ERROR    (-5)
+#define Z_VERSION_ERROR (-6)
+/* Return codes for the compression/decompression functions. Negative values
+ * are errors, positive values are used for special but normal events.
+ */
+
+#define Z_NO_COMPRESSION         0
+#define Z_BEST_SPEED             1
+#define Z_BEST_COMPRESSION       9
+#define Z_DEFAULT_COMPRESSION  (-1)
+/* compression levels */
+
+#define Z_FILTERED            1
+#define Z_HUFFMAN_ONLY        2
+#define Z_RLE                 3
+#define Z_FIXED               4
+#define Z_DEFAULT_STRATEGY    0
+/* compression strategy; see deflateInit2() below for details */
+
+#define Z_BINARY   0
+#define Z_TEXT     1
+#define Z_ASCII    Z_TEXT   /* for compatibility with 1.2.2 and earlier */
+#define Z_UNKNOWN  2
+/* Possible values of the data_type field for deflate() */
+
+#define Z_DEFLATED   8
+/* The deflate compression method (the only one supported in this version) */
+
+#define Z_NULL  0  /* for initializing zalloc, zfree, opaque */
+
+#define zlib_version zlibVersion()
+/* for compatibility with versions < 1.0.2 */
+
+
+                        /* basic functions */
+
+ZEXTERN const char * ZEXPORT zlibVersion(void);
+/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
+   If the first character differs, the library code actually used is not
+   compatible with the zlib.h header file used by the application.  This check
+   is automatically made by deflateInit and inflateInit.
+ */
+
+/*
+ZEXTERN int ZEXPORT deflateInit(z_streamp strm, int level);
+
+     Initializes the internal stream state for compression.  The fields
+   zalloc, zfree and opaque must be initialized before by the caller.  If
+   zalloc and zfree are set to Z_NULL, deflateInit updates them to use default
+   allocation functions.  total_in, total_out, adler, and msg are initialized.
+
+     The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
+   1 gives best speed, 9 gives best compression, 0 gives no compression at all
+   (the input data is simply copied a block at a time).  Z_DEFAULT_COMPRESSION
+   requests a default compromise between speed and compression (currently
+   equivalent to level 6).
+
+     deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if level is not a valid compression level, or
+   Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
+   with the version assumed by the caller (ZLIB_VERSION).  msg is set to null
+   if there is no error message.  deflateInit does not perform any compression:
+   this will be done by deflate().
+*/
+
+
+ZEXTERN int ZEXPORT deflate(z_streamp strm, int flush);
+/*
+    deflate compresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full.  It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
+
+    The detailed semantics are as follows.  deflate performs one or both of the
+  following actions:
+
+  - Compress more input starting at next_in and update next_in and avail_in
+    accordingly.  If not all input can be processed (because there is not
+    enough room in the output buffer), next_in and avail_in are updated and
+    processing will resume at this point for the next call of deflate().
+
+  - Generate more output starting at next_out and update next_out and avail_out
+    accordingly.  This action is forced if the parameter flush is non-zero.
+    Forcing flush frequently degrades the compression ratio, so this parameter
+    should be set only when necessary.  Some output may be provided even if
+    flush is zero.
+
+    Before the call of deflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming more
+  output, and updating avail_in or avail_out accordingly; avail_out should
+  never be zero before the call.  The application can consume the compressed
+  output when it wants, for example when the output buffer is full (avail_out
+  == 0), or after each call of deflate().  If deflate returns Z_OK and with
+  zero avail_out, it must be called again after making room in the output
+  buffer because there might be more output pending. See deflatePending(),
+  which can be used if desired to determine whether or not there is more output
+  in that case.
+
+    Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
+  decide how much data to accumulate before producing output, in order to
+  maximize compression.
+
+    If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
+  flushed to the output buffer and the output is aligned on a byte boundary, so
+  that the decompressor can get all input data available so far.  (In
+  particular avail_in is zero after the call if enough output space has been
+  provided before the call.) Flushing may degrade compression for some
+  compression algorithms and so it should be used only when necessary.  This
+  completes the current deflate block and follows it with an empty stored block
+  that is three bits plus filler bits to the next byte, followed by four bytes
+  (00 00 ff ff).
+
+    If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the
+  output buffer, but the output is not aligned to a byte boundary.  All of the
+  input data so far will be available to the decompressor, as for Z_SYNC_FLUSH.
+  This completes the current deflate block and follows it with an empty fixed
+  codes block that is 10 bits long.  This assures that enough bytes are output
+  in order for the decompressor to finish the block before the empty fixed
+  codes block.
+
+    If flush is set to Z_BLOCK, a deflate block is completed and emitted, as
+  for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to
+  seven bits of the current block are held to be written as the next byte after
+  the next deflate block is completed.  In this case, the decompressor may not
+  be provided enough bits at this point in order to complete decompression of
+  the data provided so far to the compressor.  It may need to wait for the next
+  block to be emitted.  This is for advanced applications that need to control
+  the emission of deflate blocks.
+
+    If flush is set to Z_FULL_FLUSH, all output is flushed as with
+  Z_SYNC_FLUSH, and the compression state is reset so that decompression can
+  restart from this point if previous compressed data has been damaged or if
+  random access is desired.  Using Z_FULL_FLUSH too often can seriously degrade
+  compression.
+
+    If deflate returns with avail_out == 0, this function must be called again
+  with the same value of the flush parameter and more output space (updated
+  avail_out), until the flush is complete (deflate returns with non-zero
+  avail_out).  In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+  avail_out is greater than six when the flush marker begins, in order to avoid
+  repeated flush markers upon calling deflate() again when avail_out == 0.
+
+    If the parameter flush is set to Z_FINISH, pending input is processed,
+  pending output is flushed and deflate returns with Z_STREAM_END if there was
+  enough output space.  If deflate returns with Z_OK or Z_BUF_ERROR, this
+  function must be called again with Z_FINISH and more output space (updated
+  avail_out) but no more input data, until it returns with Z_STREAM_END or an
+  error.  After deflate has returned Z_STREAM_END, the only possible operations
+  on the stream are deflateReset or deflateEnd.
+
+    Z_FINISH can be used in the first deflate call after deflateInit if all the
+  compression is to be done in a single step.  In order to complete in one
+  call, avail_out must be at least the value returned by deflateBound (see
+  below).  Then deflate is guaranteed to return Z_STREAM_END.  If not enough
+  output space is provided, deflate will not return Z_STREAM_END, and it must
+  be called again as described above.
+
+    deflate() sets strm->adler to the Adler-32 checksum of all input read
+  so far (that is, total_in bytes).  If a gzip stream is being generated, then
+  strm->adler will be the CRC-32 checksum of the input read so far.  (See
+  deflateInit2 below.)
+
+    deflate() may update strm->data_type if it can make a good guess about
+  the input data type (Z_BINARY or Z_TEXT).  If in doubt, the data is
+  considered binary.  This field is only for information purposes and does not
+  affect the compression algorithm in any manner.
+
+    deflate() returns Z_OK if some progress has been made (more input
+  processed or more output produced), Z_STREAM_END if all input has been
+  consumed and all output has been produced (only when flush is set to
+  Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
+  if next_in or next_out was Z_NULL or the state was inadvertently written over
+  by the application), or Z_BUF_ERROR if no progress is possible (for example
+  avail_in or avail_out was zero).  Note that Z_BUF_ERROR is not fatal, and
+  deflate() can be called again with more input and more output space to
+  continue compressing.
+*/
+
+
+ZEXTERN int ZEXPORT deflateEnd(z_streamp strm);
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any pending
+   output.
+
+     deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
+   stream state was inconsistent, Z_DATA_ERROR if the stream was freed
+   prematurely (some input or output was discarded).  In the error case, msg
+   may be set but then points to a static string (which must not be
+   deallocated).
+*/
+
+
+/*
+ZEXTERN int ZEXPORT inflateInit(z_streamp strm);
+
+     Initializes the internal stream state for decompression.  The fields
+   next_in, avail_in, zalloc, zfree and opaque must be initialized before by
+   the caller.  In the current version of inflate, the provided input is not
+   read or consumed.  The allocation of a sliding window will be deferred to
+   the first call of inflate (if the decompression does not complete on the
+   first call).  If zalloc and zfree are set to Z_NULL, inflateInit updates
+   them to use default allocation functions.  total_in, total_out, adler, and
+   msg are initialized.
+
+     inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+   version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+   invalid, such as a null pointer to the structure.  msg is set to null if
+   there is no error message.  inflateInit does not perform any decompression.
+   Actual decompression will be done by inflate().  So next_in, avail_in,
+   next_out, and avail_out are unused and unchanged.  The current
+   implementation of inflateInit() does not process any header information --
+   that is deferred until inflate() is called.
+*/
+
+
+ZEXTERN int ZEXPORT inflate(z_streamp strm, int flush);
+/*
+    inflate decompresses as much data as possible, and stops when the input
+  buffer becomes empty or the output buffer becomes full.  It may introduce
+  some output latency (reading input without producing any output) except when
+  forced to flush.
+
+  The detailed semantics are as follows.  inflate performs one or both of the
+  following actions:
+
+  - Decompress more input starting at next_in and update next_in and avail_in
+    accordingly.  If not all input can be processed (because there is not
+    enough room in the output buffer), then next_in and avail_in are updated
+    accordingly, and processing will resume at this point for the next call of
+    inflate().
+
+  - Generate more output starting at next_out and update next_out and avail_out
+    accordingly.  inflate() provides as much output as possible, until there is
+    no more input data or no more space in the output buffer (see below about
+    the flush parameter).
+
+    Before the call of inflate(), the application should ensure that at least
+  one of the actions is possible, by providing more input and/or consuming more
+  output, and updating the next_* and avail_* values accordingly.  If the
+  caller of inflate() does not provide both available input and available
+  output space, it is possible that there will be no progress made.  The
+  application can consume the uncompressed output when it wants, for example
+  when the output buffer is full (avail_out == 0), or after each call of
+  inflate().  If inflate returns Z_OK and with zero avail_out, it must be
+  called again after making room in the output buffer because there might be
+  more output pending.
+
+    The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH,
+  Z_BLOCK, or Z_TREES.  Z_SYNC_FLUSH requests that inflate() flush as much
+  output as possible to the output buffer.  Z_BLOCK requests that inflate()
+  stop if and when it gets to the next deflate block boundary.  When decoding
+  the zlib or gzip format, this will cause inflate() to return immediately
+  after the header and before the first block.  When doing a raw inflate,
+  inflate() will go ahead and process the first block, and will return when it
+  gets to the end of that block, or when it runs out of data.
+
+    The Z_BLOCK option assists in appending to or combining deflate streams.
+  To assist in this, on return inflate() always sets strm->data_type to the
+  number of unused bits in the last byte taken from strm->next_in, plus 64 if
+  inflate() is currently decoding the last block in the deflate stream, plus
+  128 if inflate() returned immediately after decoding an end-of-block code or
+  decoding the complete header up to just before the first byte of the deflate
+  stream.  The end-of-block will not be indicated until all of the uncompressed
+  data from that block has been written to strm->next_out.  The number of
+  unused bits may in general be greater than seven, except when bit 7 of
+  data_type is set, in which case the number of unused bits will be less than
+  eight.  data_type is set as noted here every time inflate() returns for all
+  flush options, and so can be used to determine the amount of currently
+  consumed input in bits.
+
+    The Z_TREES option behaves as Z_BLOCK does, but it also returns when the
+  end of each deflate block header is reached, before any actual data in that
+  block is decoded.  This allows the caller to determine the length of the
+  deflate block header for later use in random access within a deflate block.
+  256 is added to the value of strm->data_type when inflate() returns
+  immediately after reaching the end of the deflate block header.
+
+    inflate() should normally be called until it returns Z_STREAM_END or an
+  error.  However if all decompression is to be performed in a single step (a
+  single call of inflate), the parameter flush should be set to Z_FINISH.  In
+  this case all pending input is processed and all pending output is flushed;
+  avail_out must be large enough to hold all of the uncompressed data for the
+  operation to complete.  (The size of the uncompressed data may have been
+  saved by the compressor for this purpose.)  The use of Z_FINISH is not
+  required to perform an inflation in one step.  However it may be used to
+  inform inflate that a faster approach can be used for the single inflate()
+  call.  Z_FINISH also informs inflate to not maintain a sliding window if the
+  stream completes, which reduces inflate's memory footprint.  If the stream
+  does not complete, either because not all of the stream is provided or not
+  enough output space is provided, then a sliding window will be allocated and
+  inflate() can be called again to continue the operation as if Z_NO_FLUSH had
+  been used.
+
+     In this implementation, inflate() always flushes as much output as
+  possible to the output buffer, and always uses the faster approach on the
+  first call.  So the effects of the flush parameter in this implementation are
+  on the return value of inflate() as noted below, when inflate() returns early
+  when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of
+  memory for a sliding window when Z_FINISH is used.
+
+     If a preset dictionary is needed after this call (see inflateSetDictionary
+  below), inflate sets strm->adler to the Adler-32 checksum of the dictionary
+  chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
+  strm->adler to the Adler-32 checksum of all output produced so far (that is,
+  total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
+  below.  At the end of the stream, inflate() checks that its computed Adler-32
+  checksum is equal to that saved by the compressor and returns Z_STREAM_END
+  only if the checksum is correct.
+
+    inflate() can decompress and check either zlib-wrapped or gzip-wrapped
+  deflate data.  The header type is detected automatically, if requested when
+  initializing with inflateInit2().  Any information contained in the gzip
+  header is not retained unless inflateGetHeader() is used.  When processing
+  gzip-wrapped deflate data, strm->adler is set to the CRC-32 of the output
+  produced so far.  The CRC-32 is checked against the gzip trailer, as is the
+  uncompressed length, modulo 2^32.
+
+    inflate() returns Z_OK if some progress has been made (more input processed
+  or more output produced), Z_STREAM_END if the end of the compressed data has
+  been reached and all uncompressed output has been produced, Z_NEED_DICT if a
+  preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
+  corrupted (input stream not conforming to the zlib format or incorrect check
+  value, in which case strm->msg points to a string with a more specific
+  error), Z_STREAM_ERROR if the stream structure was inconsistent (for example
+  next_in or next_out was Z_NULL, or the state was inadvertently written over
+  by the application), Z_MEM_ERROR if there was not enough memory, Z_BUF_ERROR
+  if no progress was possible or if there was not enough room in the output
+  buffer when Z_FINISH is used.  Note that Z_BUF_ERROR is not fatal, and
+  inflate() can be called again with more input and more output space to
+  continue decompressing.  If Z_DATA_ERROR is returned, the application may
+  then call inflateSync() to look for a good compression block if a partial
+  recovery of the data is to be attempted.
+*/
+
+
+ZEXTERN int ZEXPORT inflateEnd(z_streamp strm);
+/*
+     All dynamically allocated data structures for this stream are freed.
+   This function discards any unprocessed input and does not flush any pending
+   output.
+
+     inflateEnd returns Z_OK if success, or Z_STREAM_ERROR if the stream state
+   was inconsistent.
+*/
+
+
+                        /* Advanced functions */
+
+/*
+    The following functions are needed only in some special applications.
+*/
+
+/*
+ZEXTERN int ZEXPORT deflateInit2(z_streamp strm,
+                                 int level,
+                                 int method,
+                                 int windowBits,
+                                 int memLevel,
+                                 int strategy);
+
+     This is another version of deflateInit with more compression options.  The
+   fields zalloc, zfree and opaque must be initialized before by the caller.
+
+     The method parameter is the compression method.  It must be Z_DEFLATED in
+   this version of the library.
+
+     The windowBits parameter is the base two logarithm of the window size
+   (the size of the history buffer).  It should be in the range 8..15 for this
+   version of the library.  Larger values of this parameter result in better
+   compression at the expense of memory usage.  The default value is 15 if
+   deflateInit is used instead.
+
+     For the current implementation of deflate(), a windowBits value of 8 (a
+   window size of 256 bytes) is not supported.  As a result, a request for 8
+   will result in 9 (a 512-byte window).  In that case, providing 8 to
+   inflateInit2() will result in an error when the zlib header with 9 is
+   checked against the initialization of inflate().  The remedy is to not use 8
+   with deflateInit2() with this initialization, or at least in that case use 9
+   with inflateInit2().
+
+     windowBits can also be -8..-15 for raw deflate.  In this case, -windowBits
+   determines the window size.  deflate() will then generate raw deflate data
+   with no zlib header or trailer, and will not compute a check value.
+
+     windowBits can also be greater than 15 for optional gzip encoding.  Add
+   16 to windowBits to write a simple gzip header and trailer around the
+   compressed data instead of a zlib wrapper.  The gzip header will have no
+   file name, no extra data, no comment, no modification time (set to zero), no
+   header crc, and the operating system will be set to the appropriate value,
+   if the operating system was determined at compile time.  If a gzip stream is
+   being written, strm->adler is a CRC-32 instead of an Adler-32.
+
+     For raw deflate or gzip encoding, a request for a 256-byte window is
+   rejected as invalid, since only the zlib header provides a means of
+   transmitting the window size to the decompressor.
+
+     The memLevel parameter specifies how much memory should be allocated
+   for the internal compression state.  memLevel=1 uses minimum memory but is
+   slow and reduces compression ratio; memLevel=9 uses maximum memory for
+   optimal speed.  The default value is 8.  See zconf.h for total memory usage
+   as a function of windowBits and memLevel.
+
+     The strategy parameter is used to tune the compression algorithm.  Use the
+   value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+   filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
+   string match), or Z_RLE to limit match distances to one (run-length
+   encoding).  Filtered data consists mostly of small values with a somewhat
+   random distribution.  In this case, the compression algorithm is tuned to
+   compress them better.  The effect of Z_FILTERED is to force more Huffman
+   coding and less string matching; it is somewhat intermediate between
+   Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY.  Z_RLE is designed to be almost as
+   fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data.  The
+   strategy parameter only affects the compression ratio but not the
+   correctness of the compressed output even if it is not set appropriately.
+   Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+   decoder for special applications.
+
+     deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid
+   method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is
+   incompatible with the version assumed by the caller (ZLIB_VERSION).  msg is
+   set to null if there is no error message.  deflateInit2 does not perform any
+   compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateSetDictionary(z_streamp strm,
+                                         const Bytef *dictionary,
+                                         uInt  dictLength);
+/*
+     Initializes the compression dictionary from the given byte sequence
+   without producing any compressed output.  When using the zlib format, this
+   function must be called immediately after deflateInit, deflateInit2 or
+   deflateReset, and before any call of deflate.  When doing raw deflate, this
+   function must be called either before any call of deflate, or immediately
+   after the completion of a deflate block, i.e. after all input has been
+   consumed and all output has been delivered when using any of the flush
+   options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH.  The
+   compressor and decompressor must use exactly the same dictionary (see
+   inflateSetDictionary).
+
+     The dictionary should consist of strings (byte sequences) that are likely
+   to be encountered later in the data to be compressed, with the most commonly
+   used strings preferably put towards the end of the dictionary.  Using a
+   dictionary is most useful when the data to be compressed is short and can be
+   predicted with good accuracy; the data can then be compressed better than
+   with the default empty dictionary.
+
+     Depending on the size of the compression data structures selected by
+   deflateInit or deflateInit2, a part of the dictionary may in effect be
+   discarded, for example if the dictionary is larger than the window size
+   provided in deflateInit or deflateInit2.  Thus the strings most likely to be
+   useful should be put at the end of the dictionary, not at the front.  In
+   addition, the current implementation of deflate will use at most the window
+   size minus 262 bytes of the provided dictionary.
+
+     Upon return of this function, strm->adler is set to the Adler-32 value
+   of the dictionary; the decompressor may later use this value to determine
+   which dictionary has been used by the compressor.  (The Adler-32 value
+   applies to the whole dictionary even if only a subset of the dictionary is
+   actually used by the compressor.) If a raw deflate was requested, then the
+   Adler-32 value is not computed and strm->adler is not set.
+
+     deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
+   parameter is invalid (e.g.  dictionary being Z_NULL) or the stream state is
+   inconsistent (for example if deflate has already been called for this stream
+   or if not at a block boundary for raw deflate).  deflateSetDictionary does
+   not perform any compression: this will be done by deflate().
+*/
+
+ZEXTERN int ZEXPORT deflateGetDictionary(z_streamp strm,
+                                         Bytef *dictionary,
+                                         uInt  *dictLength);
+/*
+     Returns the sliding dictionary being maintained by deflate.  dictLength is
+   set to the number of bytes in the dictionary, and that many bytes are copied
+   to dictionary.  dictionary must have enough space, where 32768 bytes is
+   always enough.  If deflateGetDictionary() is called with dictionary equal to
+   Z_NULL, then only the dictionary length is returned, and nothing is copied.
+   Similarly, if dictLength is Z_NULL, then it is not set.
+
+     deflateGetDictionary() may return a length less than the window size, even
+   when more than the window size in input has been provided. It may return up
+   to 258 bytes less in that case, due to how zlib's implementation of deflate
+   manages the sliding window and lookahead for matches, where matches can be
+   up to 258 bytes long. If the application needs the last window-size bytes of
+   input, then that would need to be saved by the application outside of zlib.
+
+     deflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+   stream state is inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateCopy(z_streamp dest,
+                                z_streamp source);
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when several compression strategies will be
+   tried, for example when there are several ways of pre-processing the input
+   data with a filter.  The streams that will be discarded should then be freed
+   by calling deflateEnd.  Note that deflateCopy duplicates the internal
+   compression state which can be quite large, so this strategy is slow and can
+   consume lots of memory.
+
+     deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being Z_NULL).  msg is left unchanged in both source and
+   destination.
+*/
+
+ZEXTERN int ZEXPORT deflateReset(z_streamp strm);
+/*
+     This function is equivalent to deflateEnd followed by deflateInit, but
+   does not free and reallocate the internal compression state.  The stream
+   will leave the compression level and any other attributes that may have been
+   set unchanged.  total_in, total_out, adler, and msg are initialized.
+
+     deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being Z_NULL).
+*/
+
+ZEXTERN int ZEXPORT deflateParams(z_streamp strm,
+                                  int level,
+                                  int strategy);
+/*
+     Dynamically update the compression level and compression strategy.  The
+   interpretation of level and strategy is as in deflateInit2().  This can be
+   used to switch between compression and straight copy of the input data, or
+   to switch to a different kind of input data requiring a different strategy.
+   If the compression approach (which is a function of the level) or the
+   strategy is changed, and if there have been any deflate() calls since the
+   state was initialized or reset, then the input available so far is
+   compressed with the old level and strategy using deflate(strm, Z_BLOCK).
+   There are three approaches for the compression levels 0, 1..3, and 4..9
+   respectively.  The new level and strategy will take effect at the next call
+   of deflate().
+
+     If a deflate(strm, Z_BLOCK) is performed by deflateParams(), and it does
+   not have enough output space to complete, then the parameter change will not
+   take effect.  In this case, deflateParams() can be called again with the
+   same parameters and more output space to try again.
+
+     In order to assure a change in the parameters on the first try, the
+   deflate stream should be flushed using deflate() with Z_BLOCK or other flush
+   request until strm.avail_out is not zero, before calling deflateParams().
+   Then no more input data should be provided before the deflateParams() call.
+   If this is done, the old level and strategy will be applied to the data
+   compressed before deflateParams(), and the new level and strategy will be
+   applied to the data compressed after deflateParams().
+
+     deflateParams returns Z_OK on success, Z_STREAM_ERROR if the source stream
+   state was inconsistent or if a parameter was invalid, or Z_BUF_ERROR if
+   there was not enough output space to complete the compression of the
+   available input data before a change in the strategy or approach.  Note that
+   in the case of a Z_BUF_ERROR, the parameters are not changed.  A return
+   value of Z_BUF_ERROR is not fatal, in which case deflateParams() can be
+   retried with more output space.
+*/
+
+ZEXTERN int ZEXPORT deflateTune(z_streamp strm,
+                                int good_length,
+                                int max_lazy,
+                                int nice_length,
+                                int max_chain);
+/*
+     Fine tune deflate's internal compression parameters.  This should only be
+   used by someone who understands the algorithm used by zlib's deflate for
+   searching for the best matching string, and even then only by the most
+   fanatic optimizer trying to squeeze out the last compressed bit for their
+   specific input data.  Read the deflate.c source code for the meaning of the
+   max_lazy, good_length, nice_length, and max_chain parameters.
+
+     deflateTune() can be called after deflateInit() or deflateInit2(), and
+   returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream.
+ */
+
+ZEXTERN uLong ZEXPORT deflateBound(z_streamp strm,
+                                   uLong sourceLen);
+/*
+     deflateBound() returns an upper bound on the compressed size after
+   deflation of sourceLen bytes.  It must be called after deflateInit() or
+   deflateInit2(), and after deflateSetHeader(), if used.  This would be used
+   to allocate an output buffer for deflation in a single pass, and so would be
+   called before deflate().  If that first deflate() call is provided the
+   sourceLen input bytes, an output buffer allocated to the size returned by
+   deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed
+   to return Z_STREAM_END.  Note that it is possible for the compressed size to
+   be larger than the value returned by deflateBound() if flush options other
+   than Z_FINISH or Z_NO_FLUSH are used.
+*/
+
+ZEXTERN int ZEXPORT deflatePending(z_streamp strm,
+                                   unsigned *pending,
+                                   int *bits);
+/*
+     deflatePending() returns the number of bytes and bits of output that have
+   been generated, but not yet provided in the available output.  The bytes not
+   provided would be due to the available output space having been consumed.
+   The number of bits of output not provided is between 0 and 7, where they
+   await more bits to join them in order to fill out a full byte.  If pending
+   or bits are Z_NULL, then those values are not set.
+
+     deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+ */
+
+ZEXTERN int ZEXPORT deflatePrime(z_streamp strm,
+                                 int bits,
+                                 int value);
+/*
+     deflatePrime() inserts bits in the deflate output stream.  The intent
+   is that this function is used to start off the deflate output with the bits
+   leftover from a previous deflate stream when appending to it.  As such, this
+   function can only be used for raw deflate, and must be used before the first
+   deflate() call after a deflateInit2() or deflateReset().  bits must be less
+   than or equal to 16, and that many of the least significant bits of value
+   will be inserted in the output.
+
+     deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough
+   room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the
+   source stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT deflateSetHeader(z_streamp strm,
+                                     gz_headerp head);
+/*
+     deflateSetHeader() provides gzip header information for when a gzip
+   stream is requested by deflateInit2().  deflateSetHeader() may be called
+   after deflateInit2() or deflateReset() and before the first call of
+   deflate().  The text, time, os, extra field, name, and comment information
+   in the provided gz_header structure are written to the gzip header (xflag is
+   ignored -- the extra flags are set according to the compression level).  The
+   caller must assure that, if not Z_NULL, name and comment are terminated with
+   a zero byte, and that if extra is not Z_NULL, that extra_len bytes are
+   available there.  If hcrc is true, a gzip header crc is included.  Note that
+   the current versions of the command-line version of gzip (up through version
+   1.3.x) do not support header crc's, and will report that it is a "multi-part
+   gzip file" and give up.
+
+     If deflateSetHeader is not used, the default gzip header has text false,
+   the time set to zero, and os set to the current operating system, with no
+   extra, name, or comment fields.  The gzip header is returned to the default
+   state by deflateReset().
+
+     deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateInit2(z_streamp strm,
+                                 int windowBits);
+
+     This is another version of inflateInit with an extra parameter.  The
+   fields next_in, avail_in, zalloc, zfree and opaque must be initialized
+   before by the caller.
+
+     The windowBits parameter is the base two logarithm of the maximum window
+   size (the size of the history buffer).  It should be in the range 8..15 for
+   this version of the library.  The default value is 15 if inflateInit is used
+   instead.  windowBits must be greater than or equal to the windowBits value
+   provided to deflateInit2() while compressing, or it must be equal to 15 if
+   deflateInit2() was not used.  If a compressed stream with a larger window
+   size is given as input, inflate() will return with the error code
+   Z_DATA_ERROR instead of trying to allocate a larger window.
+
+     windowBits can also be zero to request that inflate use the window size in
+   the zlib header of the compressed stream.
+
+     windowBits can also be -8..-15 for raw inflate.  In this case, -windowBits
+   determines the window size.  inflate() will then process raw deflate data,
+   not looking for a zlib or gzip header, not generating a check value, and not
+   looking for any check values for comparison at the end of the stream.  This
+   is for use with other formats that use the deflate compressed data format
+   such as zip.  Those formats provide their own check values.  If a custom
+   format is developed using the raw deflate format for compressed data, it is
+   recommended that a check value such as an Adler-32 or a CRC-32 be applied to
+   the uncompressed data as is done in the zlib, gzip, and zip formats.  For
+   most applications, the zlib format should be used as is.  Note that comments
+   above on the use in deflateInit2() apply to the magnitude of windowBits.
+
+     windowBits can also be greater than 15 for optional gzip decoding.  Add
+   32 to windowBits to enable zlib and gzip decoding with automatic header
+   detection, or add 16 to decode only the gzip format (the zlib format will
+   return a Z_DATA_ERROR).  If a gzip stream is being decoded, strm->adler is a
+   CRC-32 instead of an Adler-32.  Unlike the gunzip utility and gzread() (see
+   below), inflate() will *not* automatically decode concatenated gzip members.
+   inflate() will return Z_STREAM_END at the end of the gzip member.  The state
+   would need to be reset to continue decoding a subsequent gzip member.  This
+   *must* be done if there is more data after a gzip member, in order for the
+   decompression to be compliant with the gzip standard (RFC 1952).
+
+     inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+   version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+   invalid, such as a null pointer to the structure.  msg is set to null if
+   there is no error message.  inflateInit2 does not perform any decompression
+   apart from possibly reading the zlib header if present: actual decompression
+   will be done by inflate().  (So next_in and avail_in may be modified, but
+   next_out and avail_out are unused and unchanged.) The current implementation
+   of inflateInit2() does not process any header information -- that is
+   deferred until inflate() is called.
+*/
+
+ZEXTERN int ZEXPORT inflateSetDictionary(z_streamp strm,
+                                         const Bytef *dictionary,
+                                         uInt  dictLength);
+/*
+     Initializes the decompression dictionary from the given uncompressed byte
+   sequence.  This function must be called immediately after a call of inflate,
+   if that call returned Z_NEED_DICT.  The dictionary chosen by the compressor
+   can be determined from the Adler-32 value returned by that call of inflate.
+   The compressor and decompressor must use exactly the same dictionary (see
+   deflateSetDictionary).  For raw inflate, this function can be called at any
+   time to set the dictionary.  If the provided dictionary is smaller than the
+   window and there is already data in the window, then the provided dictionary
+   will amend what's there.  The application must ensure that the dictionary
+   that was used for compression is provided.
+
+     inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
+   parameter is invalid (e.g.  dictionary being Z_NULL) or the stream state is
+   inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
+   expected one (incorrect Adler-32 value).  inflateSetDictionary does not
+   perform any decompression: this will be done by subsequent calls of
+   inflate().
+*/
+
+ZEXTERN int ZEXPORT inflateGetDictionary(z_streamp strm,
+                                         Bytef *dictionary,
+                                         uInt  *dictLength);
+/*
+     Returns the sliding dictionary being maintained by inflate.  dictLength is
+   set to the number of bytes in the dictionary, and that many bytes are copied
+   to dictionary.  dictionary must have enough space, where 32768 bytes is
+   always enough.  If inflateGetDictionary() is called with dictionary equal to
+   Z_NULL, then only the dictionary length is returned, and nothing is copied.
+   Similarly, if dictLength is Z_NULL, then it is not set.
+
+     inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+   stream state is inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateSync(z_streamp strm);
+/*
+     Skips invalid compressed data until a possible full flush point (see above
+   for the description of deflate with Z_FULL_FLUSH) can be found, or until all
+   available input is skipped.  No output is provided.
+
+     inflateSync searches for a 00 00 FF FF pattern in the compressed data.
+   All full flush points have this pattern, but not all occurrences of this
+   pattern are full flush points.
+
+     inflateSync returns Z_OK if a possible full flush point has been found,
+   Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point
+   has been found, or Z_STREAM_ERROR if the stream structure was inconsistent.
+   In the success case, the application may save the current value of total_in
+   which indicates where valid compressed data was found.  In the error case,
+   the application may repeatedly call inflateSync, providing more input each
+   time, until success or end of the input data.
+*/
+
+ZEXTERN int ZEXPORT inflateCopy(z_streamp dest,
+                                z_streamp source);
+/*
+     Sets the destination stream as a complete copy of the source stream.
+
+     This function can be useful when randomly accessing a large stream.  The
+   first pass through the stream can periodically record the inflate state,
+   allowing restarting inflate at those points when randomly accessing the
+   stream.
+
+     inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
+   (such as zalloc being Z_NULL).  msg is left unchanged in both source and
+   destination.
+*/
+
+ZEXTERN int ZEXPORT inflateReset(z_streamp strm);
+/*
+     This function is equivalent to inflateEnd followed by inflateInit,
+   but does not free and reallocate the internal decompression state.  The
+   stream will keep attributes that may have been set by inflateInit2.
+   total_in, total_out, adler, and msg are initialized.
+
+     inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being Z_NULL).
+*/
+
+ZEXTERN int ZEXPORT inflateReset2(z_streamp strm,
+                                  int windowBits);
+/*
+     This function is the same as inflateReset, but it also permits changing
+   the wrap and window size requests.  The windowBits parameter is interpreted
+   the same as it is for inflateInit2.  If the window size is changed, then the
+   memory allocated for the window is freed, and the window will be reallocated
+   by inflate() if needed.
+
+     inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent (such as zalloc or state being Z_NULL), or if
+   the windowBits parameter is invalid.
+*/
+
+ZEXTERN int ZEXPORT inflatePrime(z_streamp strm,
+                                 int bits,
+                                 int value);
+/*
+     This function inserts bits in the inflate input stream.  The intent is
+   that this function is used to start inflating at a bit position in the
+   middle of a byte.  The provided bits will be used before any bytes are used
+   from next_in.  This function should only be used with raw inflate, and
+   should be used before the first inflate() call after inflateInit2() or
+   inflateReset().  bits must be less than or equal to 16, and that many of the
+   least significant bits of value will be inserted in the input.
+
+     If bits is negative, then the input stream bit buffer is emptied.  Then
+   inflatePrime() can be called again to put bits in the buffer.  This is used
+   to clear out bits leftover after feeding inflate a block description prior
+   to feeding inflate codes.
+
+     inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+ZEXTERN long ZEXPORT inflateMark(z_streamp strm);
+/*
+     This function returns two values, one in the lower 16 bits of the return
+   value, and the other in the remaining upper bits, obtained by shifting the
+   return value down 16 bits.  If the upper value is -1 and the lower value is
+   zero, then inflate() is currently decoding information outside of a block.
+   If the upper value is -1 and the lower value is non-zero, then inflate is in
+   the middle of a stored block, with the lower value equaling the number of
+   bytes from the input remaining to copy.  If the upper value is not -1, then
+   it is the number of bits back from the current bit position in the input of
+   the code (literal or length/distance pair) currently being processed.  In
+   that case the lower value is the number of bytes already emitted for that
+   code.
+
+     A code is being processed if inflate is waiting for more input to complete
+   decoding of the code, or if it has completed decoding but is waiting for
+   more output space to write the literal or match data.
+
+     inflateMark() is used to mark locations in the input data for random
+   access, which may be at bit positions, and to note those cases where the
+   output of a code may span boundaries of random access blocks.  The current
+   location in the input stream can be determined from avail_in and data_type
+   as noted in the description for the Z_BLOCK flush parameter for inflate.
+
+     inflateMark returns the value noted above, or -65536 if the provided
+   source stream state was inconsistent.
+*/
+
+ZEXTERN int ZEXPORT inflateGetHeader(z_streamp strm,
+                                     gz_headerp head);
+/*
+     inflateGetHeader() requests that gzip header information be stored in the
+   provided gz_header structure.  inflateGetHeader() may be called after
+   inflateInit2() or inflateReset(), and before the first call of inflate().
+   As inflate() processes the gzip stream, head->done is zero until the header
+   is completed, at which time head->done is set to one.  If a zlib stream is
+   being decoded, then head->done is set to -1 to indicate that there will be
+   no gzip header information forthcoming.  Note that Z_BLOCK or Z_TREES can be
+   used to force inflate() to return immediately after header processing is
+   complete and before any actual data is decompressed.
+
+     The text, time, xflags, and os fields are filled in with the gzip header
+   contents.  hcrc is set to true if there is a header CRC.  (The header CRC
+   was valid if done is set to one.) If extra is not Z_NULL, then extra_max
+   contains the maximum number of bytes to write to extra.  Once done is true,
+   extra_len contains the actual extra field length, and extra contains the
+   extra field, or that field truncated if extra_max is less than extra_len.
+   If name is not Z_NULL, then up to name_max characters are written there,
+   terminated with a zero unless the length is greater than name_max.  If
+   comment is not Z_NULL, then up to comm_max characters are written there,
+   terminated with a zero unless the length is greater than comm_max.  When any
+   of extra, name, or comment are not Z_NULL and the respective field is not
+   present in the header, then that field is set to Z_NULL to signal its
+   absence.  This allows the use of deflateSetHeader() with the returned
+   structure to duplicate the header.  However if those fields are set to
+   allocated memory, then the application will need to save those pointers
+   elsewhere so that they can be eventually freed.
+
+     If inflateGetHeader is not used, then the header information is simply
+   discarded.  The header is always checked for validity, including the header
+   CRC if present.  inflateReset() will reset the process to discard the header
+   information.  The application would need to call inflateGetHeader() again to
+   retrieve the header from the next gzip stream.
+
+     inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+   stream state was inconsistent.
+*/
+
+/*
+ZEXTERN int ZEXPORT inflateBackInit(z_streamp strm, int windowBits,
+                                    unsigned char FAR *window);
+
+     Initialize the internal stream state for decompression using inflateBack()
+   calls.  The fields zalloc, zfree and opaque in strm must be initialized
+   before the call.  If zalloc and zfree are Z_NULL, then the default library-
+   derived memory allocation routines are used.  windowBits is the base two
+   logarithm of the window size, in the range 8..15.  window is a caller
+   supplied buffer of that size.  Except for special applications where it is
+   assured that deflate was used with small window sizes, windowBits must be 15
+   and a 32K byte window must be supplied to be able to decompress general
+   deflate streams.
+
+     See inflateBack() for the usage of these routines.
+
+     inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
+   the parameters are invalid, Z_MEM_ERROR if the internal state could not be
+   allocated, or Z_VERSION_ERROR if the version of the library does not match
+   the version of the header file.
+*/
+
+typedef unsigned (*in_func)(void FAR *,
+                            z_const unsigned char FAR * FAR *);
+typedef int (*out_func)(void FAR *, unsigned char FAR *, unsigned);
+
+ZEXTERN int ZEXPORT inflateBack(z_streamp strm,
+                                in_func in, void FAR *in_desc,
+                                out_func out, void FAR *out_desc);
+/*
+     inflateBack() does a raw inflate with a single call using a call-back
+   interface for input and output.  This is potentially more efficient than
+   inflate() for file i/o applications, in that it avoids copying between the
+   output and the sliding window by simply making the window itself the output
+   buffer.  inflate() can be faster on modern CPUs when used with large
+   buffers.  inflateBack() trusts the application to not change the output
+   buffer passed by the output function, at least until inflateBack() returns.
+
+     inflateBackInit() must be called first to allocate the internal state
+   and to initialize the state with the user-provided window buffer.
+   inflateBack() may then be used multiple times to inflate a complete, raw
+   deflate stream with each call.  inflateBackEnd() is then called to free the
+   allocated state.
+
+     A raw deflate stream is one with no zlib or gzip header or trailer.
+   This routine would normally be used in a utility that reads zip or gzip
+   files and writes out uncompressed files.  The utility would decode the
+   header and process the trailer on its own, hence this routine expects only
+   the raw deflate stream to decompress.  This is different from the default
+   behavior of inflate(), which expects a zlib header and trailer around the
+   deflate stream.
+
+     inflateBack() uses two subroutines supplied by the caller that are then
+   called by inflateBack() for input and output.  inflateBack() calls those
+   routines until it reads a complete deflate stream and writes out all of the
+   uncompressed data, or until it encounters an error.  The function's
+   parameters and return types are defined above in the in_func and out_func
+   typedefs.  inflateBack() will call in(in_desc, &buf) which should return the
+   number of bytes of provided input, and a pointer to that input in buf.  If
+   there is no input available, in() must return zero -- buf is ignored in that
+   case -- and inflateBack() will return a buffer error.  inflateBack() will
+   call out(out_desc, buf, len) to write the uncompressed data buf[0..len-1].
+   out() should return zero on success, or non-zero on failure.  If out()
+   returns non-zero, inflateBack() will return with an error.  Neither in() nor
+   out() are permitted to change the contents of the window provided to
+   inflateBackInit(), which is also the buffer that out() uses to write from.
+   The length written by out() will be at most the window size.  Any non-zero
+   amount of input may be provided by in().
+
+     For convenience, inflateBack() can be provided input on the first call by
+   setting strm->next_in and strm->avail_in.  If that input is exhausted, then
+   in() will be called.  Therefore strm->next_in must be initialized before
+   calling inflateBack().  If strm->next_in is Z_NULL, then in() will be called
+   immediately for input.  If strm->next_in is not Z_NULL, then strm->avail_in
+   must also be initialized, and then if strm->avail_in is not zero, input will
+   initially be taken from strm->next_in[0 ..  strm->avail_in - 1].
+
+     The in_desc and out_desc parameters of inflateBack() are passed as the
+   first parameter of in() and out() respectively when they are called.  These
+   descriptors can be optionally used to pass any information that the caller-
+   supplied in() and out() functions need to do their job.
+
+     On return, inflateBack() will set strm->next_in and strm->avail_in to
+   pass back any unused input that was provided by the last in() call.  The
+   return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
+   if in() or out() returned an error, Z_DATA_ERROR if there was a format error
+   in the deflate stream (in which case strm->msg is set to indicate the nature
+   of the error), or Z_STREAM_ERROR if the stream was not properly initialized.
+   In the case of Z_BUF_ERROR, an input or output error can be distinguished
+   using strm->next_in which will be Z_NULL only if in() returned an error.  If
+   strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning
+   non-zero.  (in() will always be called before out(), so strm->next_in is
+   assured to be defined if out() returns non-zero.)  Note that inflateBack()
+   cannot return Z_OK.
+*/
+
+ZEXTERN int ZEXPORT inflateBackEnd(z_streamp strm);
+/*
+     All memory allocated by inflateBackInit() is freed.
+
+     inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream
+   state was inconsistent.
+*/
+
+ZEXTERN uLong ZEXPORT zlibCompileFlags(void);
+/* Return flags indicating compile-time options.
+
+    Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other:
+     1.0: size of uInt
+     3.2: size of uLong
+     5.4: size of voidpf (pointer)
+     7.6: size of z_off_t
+
+    Compiler, assembler, and debug options:
+     8: ZLIB_DEBUG
+     9: ASMV or ASMINF -- use ASM code
+     10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention
+     11: 0 (reserved)
+
+    One-time table building (smaller code, but not thread-safe if true):
+     12: BUILDFIXED -- build static block decoding tables when needed
+     13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed
+     14,15: 0 (reserved)
+
+    Library content (indicates missing functionality):
+     16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking
+                          deflate code when not needed)
+     17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect
+                    and decode gzip streams (to avoid linking crc code)
+     18-19: 0 (reserved)
+
+    Operation variations (changes in library functionality):
+     20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate
+     21: FASTEST -- deflate algorithm with only one, lowest compression level
+     22,23: 0 (reserved)
+
+    The sprintf variant used by gzprintf (zero is best):
+     24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format
+     25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure!
+     26: 0 = returns value, 1 = void -- 1 means inferred string length returned
+
+    Remainder:
+     27-31: 0 (reserved)
+ */
+
+#ifndef Z_SOLO
+
+                        /* utility functions */
+
+/*
+     The following utility functions are implemented on top of the basic
+   stream-oriented functions.  To simplify the interface, some default options
+   are assumed (compression level and memory usage, standard memory allocation
+   functions).  The source code of these utility functions can be modified if
+   you need special options.
+*/
+
+ZEXTERN int ZEXPORT compress(Bytef *dest,   uLongf *destLen,
+                             const Bytef *source, uLong sourceLen);
+/*
+     Compresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer.  Upon entry, destLen is the total size
+   of the destination buffer, which must be at least the value returned by
+   compressBound(sourceLen).  Upon exit, destLen is the actual size of the
+   compressed data.  compress() is equivalent to compress2() with a level
+   parameter of Z_DEFAULT_COMPRESSION.
+
+     compress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer.
+*/
+
+ZEXTERN int ZEXPORT compress2(Bytef *dest,   uLongf *destLen,
+                              const Bytef *source, uLong sourceLen,
+                              int level);
+/*
+     Compresses the source buffer into the destination buffer.  The level
+   parameter has the same meaning as in deflateInit.  sourceLen is the byte
+   length of the source buffer.  Upon entry, destLen is the total size of the
+   destination buffer, which must be at least the value returned by
+   compressBound(sourceLen).  Upon exit, destLen is the actual size of the
+   compressed data.
+
+     compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+   memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+   Z_STREAM_ERROR if the level parameter is invalid.
+*/
+
+ZEXTERN uLong ZEXPORT compressBound(uLong sourceLen);
+/*
+     compressBound() returns an upper bound on the compressed size after
+   compress() or compress2() on sourceLen bytes.  It would be used before a
+   compress() or compress2() call to allocate the destination buffer.
+*/
+
+ZEXTERN int ZEXPORT uncompress(Bytef *dest,   uLongf *destLen,
+                               const Bytef *source, uLong sourceLen);
+/*
+     Decompresses the source buffer into the destination buffer.  sourceLen is
+   the byte length of the source buffer.  Upon entry, destLen is the total size
+   of the destination buffer, which must be large enough to hold the entire
+   uncompressed data.  (The size of the uncompressed data must have been saved
+   previously by the compressor and transmitted to the decompressor by some
+   mechanism outside the scope of this compression library.) Upon exit, destLen
+   is the actual size of the uncompressed data.
+
+     uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+   enough memory, Z_BUF_ERROR if there was not enough room in the output
+   buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.  In
+   the case where there is not enough room, uncompress() will fill the output
+   buffer with the uncompressed data up to that point.
+*/
+
+ZEXTERN int ZEXPORT uncompress2(Bytef *dest,   uLongf *destLen,
+                                const Bytef *source, uLong *sourceLen);
+/*
+     Same as uncompress, except that sourceLen is a pointer, where the
+   length of the source is *sourceLen.  On return, *sourceLen is the number of
+   source bytes consumed.
+*/
+
+                        /* gzip file access functions */
+
+/*
+     This library supports reading and writing files in gzip (.gz) format with
+   an interface similar to that of stdio, using the functions that start with
+   "gz".  The gzip format is different from the zlib format: it is a wrapper,
+   documented in RFC 1952, placed around a deflate stream.
+*/
+
+typedef struct gzFile_s *gzFile;    /* semi-opaque gzip file descriptor */
+
+/*
+ZEXTERN gzFile ZEXPORT gzopen(const char *path, const char *mode);
+
+     Open the gzip (.gz) file at path for reading and decompressing, or
+   compressing and writing.  The mode parameter is as in fopen ("rb" or "wb")
+   but can also include a compression level ("wb9") or a strategy: 'f' for
+   filtered data as in "wb6f", 'h' for Huffman-only compression as in "wb1h",
+   'R' for run-length encoding as in "wb1R", or 'F' for fixed code compression
+   as in "wb9F".  (See the description of deflateInit2 for more information
+   about the strategy parameter.)  'T' will request transparent writing or
+   appending with no compression and not using the gzip format.
+
+     "a" can be used instead of "w" to request that the gzip stream that will
+   be written be appended to the file.  "+" will result in an error, since
+   reading and writing to the same gzip file is not supported.  The addition of
+   "x" when writing will create the file exclusively, which fails if the file
+   already exists.  On systems that support it, the addition of "e" when
+   reading or writing will set the flag to close the file on an execve() call.
+
+     These functions, as well as gzip, will read and decode a sequence of gzip
+   streams in a file.  The append function of gzopen() can be used to create
+   such a file.  (Also see gzflush() for another way to do this.)  When
+   appending, gzopen does not test whether the file begins with a gzip stream,
+   nor does it look for the end of the gzip streams to begin appending.  gzopen
+   will simply append a gzip stream to the existing file.
+
+     gzopen can be used to read a file which is not in gzip format; in this
+   case gzread will directly read from the file without decompression.  When
+   reading, this will be detected automatically by looking for the magic two-
+   byte gzip header.
+
+     gzopen returns NULL if the file could not be opened, if there was
+   insufficient memory to allocate the gzFile state, or if an invalid mode was
+   specified (an 'r', 'w', or 'a' was not provided, or '+' was provided).
+   errno can be checked to determine if the reason gzopen failed was that the
+   file could not be opened.
+*/
+
+ZEXTERN gzFile ZEXPORT gzdopen(int fd, const char *mode);
+/*
+     Associate a gzFile with the file descriptor fd.  File descriptors are
+   obtained from calls like open, dup, creat, pipe or fileno (if the file has
+   been previously opened with fopen).  The mode parameter is as in gzopen.
+
+     The next call of gzclose on the returned gzFile will also close the file
+   descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor
+   fd.  If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd,
+   mode);.  The duplicated descriptor should be saved to avoid a leak, since
+   gzdopen does not close fd if it fails.  If you are using fileno() to get the
+   file descriptor from a FILE *, then you will have to use dup() to avoid
+   double-close()ing the file descriptor.  Both gzclose() and fclose() will
+   close the associated file descriptor, so they need to have different file
+   descriptors.
+
+     gzdopen returns NULL if there was insufficient memory to allocate the
+   gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not
+   provided, or '+' was provided), or if fd is -1.  The file descriptor is not
+   used until the next gz* read, write, seek, or close operation, so gzdopen
+   will not detect if fd is invalid (unless fd is -1).
+*/
+
+ZEXTERN int ZEXPORT gzbuffer(gzFile file, unsigned size);
+/*
+     Set the internal buffer size used by this library's functions for file to
+   size.  The default buffer size is 8192 bytes.  This function must be called
+   after gzopen() or gzdopen(), and before any other calls that read or write
+   the file.  The buffer memory allocation is always deferred to the first read
+   or write.  Three times that size in buffer space is allocated.  A larger
+   buffer size of, for example, 64K or 128K bytes will noticeably increase the
+   speed of decompression (reading).
+
+     The new buffer size also affects the maximum length for gzprintf().
+
+     gzbuffer() returns 0 on success, or -1 on failure, such as being called
+   too late.
+*/
+
+ZEXTERN int ZEXPORT gzsetparams(gzFile file, int level, int strategy);
+/*
+     Dynamically update the compression level and strategy for file.  See the
+   description of deflateInit2 for the meaning of these parameters. Previously
+   provided data is flushed before applying the parameter changes.
+
+     gzsetparams returns Z_OK if success, Z_STREAM_ERROR if the file was not
+   opened for writing, Z_ERRNO if there is an error writing the flushed data,
+   or Z_MEM_ERROR if there is a memory allocation error.
+*/
+
+ZEXTERN int ZEXPORT gzread(gzFile file, voidp buf, unsigned len);
+/*
+     Read and decompress up to len uncompressed bytes from file into buf.  If
+   the input file is not in gzip format, gzread copies the given number of
+   bytes into the buffer directly from the file.
+
+     After reaching the end of a gzip stream in the input, gzread will continue
+   to read, looking for another gzip stream.  Any number of gzip streams may be
+   concatenated in the input file, and will all be decompressed by gzread().
+   If something other than a gzip stream is encountered after a gzip stream,
+   that remaining trailing garbage is ignored (and no error is returned).
+
+     gzread can be used to read a gzip file that is being concurrently written.
+   Upon reaching the end of the input, gzread will return with the available
+   data.  If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then
+   gzclearerr can be used to clear the end of file indicator in order to permit
+   gzread to be tried again.  Z_OK indicates that a gzip stream was completed
+   on the last gzread.  Z_BUF_ERROR indicates that the input file ended in the
+   middle of a gzip stream.  Note that gzread does not return -1 in the event
+   of an incomplete gzip stream.  This error is deferred until gzclose(), which
+   will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip
+   stream.  Alternatively, gzerror can be used before gzclose to detect this
+   case.
+
+     gzread returns the number of uncompressed bytes actually read, less than
+   len for end of file, or -1 for error.  If len is too large to fit in an int,
+   then nothing is read, -1 is returned, and the error state is set to
+   Z_STREAM_ERROR.
+*/
+
+ZEXTERN z_size_t ZEXPORT gzfread(voidp buf, z_size_t size, z_size_t nitems,
+                                 gzFile file);
+/*
+     Read and decompress up to nitems items of size size from file into buf,
+   otherwise operating as gzread() does.  This duplicates the interface of
+   stdio's fread(), with size_t request and return types.  If the library
+   defines size_t, then z_size_t is identical to size_t.  If not, then z_size_t
+   is an unsigned integer type that can contain a pointer.
+
+     gzfread() returns the number of full items read of size size, or zero if
+   the end of the file was reached and a full item could not be read, or if
+   there was an error.  gzerror() must be consulted if zero is returned in
+   order to determine if there was an error.  If the multiplication of size and
+   nitems overflows, i.e. the product does not fit in a z_size_t, then nothing
+   is read, zero is returned, and the error state is set to Z_STREAM_ERROR.
+
+     In the event that the end of file is reached and only a partial item is
+   available at the end, i.e. the remaining uncompressed data length is not a
+   multiple of size, then the final partial item is nevertheless read into buf
+   and the end-of-file flag is set.  The length of the partial item read is not
+   provided, but could be inferred from the result of gztell().  This behavior
+   is the same as the behavior of fread() implementations in common libraries,
+   but it prevents the direct use of gzfread() to read a concurrently written
+   file, resetting and retrying on end-of-file, when size is not 1.
+*/
+
+ZEXTERN int ZEXPORT gzwrite(gzFile file, voidpc buf, unsigned len);
+/*
+     Compress and write the len uncompressed bytes at buf to file. gzwrite
+   returns the number of uncompressed bytes written or 0 in case of error.
+*/
+
+ZEXTERN z_size_t ZEXPORT gzfwrite(voidpc buf, z_size_t size,
+                                  z_size_t nitems, gzFile file);
+/*
+     Compress and write nitems items of size size from buf to file, duplicating
+   the interface of stdio's fwrite(), with size_t request and return types.  If
+   the library defines size_t, then z_size_t is identical to size_t.  If not,
+   then z_size_t is an unsigned integer type that can contain a pointer.
+
+     gzfwrite() returns the number of full items written of size size, or zero
+   if there was an error.  If the multiplication of size and nitems overflows,
+   i.e. the product does not fit in a z_size_t, then nothing is written, zero
+   is returned, and the error state is set to Z_STREAM_ERROR.
+*/
+
+ZEXTERN int ZEXPORTVA gzprintf(gzFile file, const char *format, ...);
+/*
+     Convert, format, compress, and write the arguments (...) to file under
+   control of the string format, as in fprintf.  gzprintf returns the number of
+   uncompressed bytes actually written, or a negative zlib error code in case
+   of error.  The number of uncompressed bytes written is limited to 8191, or
+   one less than the buffer size given to gzbuffer().  The caller should assure
+   that this limit is not exceeded.  If it is exceeded, then gzprintf() will
+   return an error (0) with nothing written.  In this case, there may also be a
+   buffer overflow with unpredictable consequences, which is possible only if
+   zlib was compiled with the insecure functions sprintf() or vsprintf(),
+   because the secure snprintf() or vsnprintf() functions were not available.
+   This can be determined using zlibCompileFlags().
+*/
+
+ZEXTERN int ZEXPORT gzputs(gzFile file, const char *s);
+/*
+     Compress and write the given null-terminated string s to file, excluding
+   the terminating null character.
+
+     gzputs returns the number of characters written, or -1 in case of error.
+*/
+
+ZEXTERN char * ZEXPORT gzgets(gzFile file, char *buf, int len);
+/*
+     Read and decompress bytes from file into buf, until len-1 characters are
+   read, or until a newline character is read and transferred to buf, or an
+   end-of-file condition is encountered.  If any characters are read or if len
+   is one, the string is terminated with a null character.  If no characters
+   are read due to an end-of-file or len is less than one, then the buffer is
+   left untouched.
+
+     gzgets returns buf which is a null-terminated string, or it returns NULL
+   for end-of-file or in case of error.  If there was an error, the contents at
+   buf are indeterminate.
+*/
+
+ZEXTERN int ZEXPORT gzputc(gzFile file, int c);
+/*
+     Compress and write c, converted to an unsigned char, into file.  gzputc
+   returns the value that was written, or -1 in case of error.
+*/
+
+ZEXTERN int ZEXPORT gzgetc(gzFile file);
+/*
+     Read and decompress one byte from file.  gzgetc returns this byte or -1
+   in case of end of file or error.  This is implemented as a macro for speed.
+   As such, it does not do all of the checking the other functions do.  I.e.
+   it does not check to see if file is NULL, nor whether the structure file
+   points to has been clobbered or not.
+*/
+
+ZEXTERN int ZEXPORT gzungetc(int c, gzFile file);
+/*
+     Push c back onto the stream for file to be read as the first character on
+   the next read.  At least one character of push-back is always allowed.
+   gzungetc() returns the character pushed, or -1 on failure.  gzungetc() will
+   fail if c is -1, and may fail if a character has been pushed but not read
+   yet.  If gzungetc is used immediately after gzopen or gzdopen, at least the
+   output buffer size of pushed characters is allowed.  (See gzbuffer above.)
+   The pushed character will be discarded if the stream is repositioned with
+   gzseek() or gzrewind().
+*/
+
+ZEXTERN int ZEXPORT gzflush(gzFile file, int flush);
+/*
+     Flush all pending output to file.  The parameter flush is as in the
+   deflate() function.  The return value is the zlib error number (see function
+   gzerror below).  gzflush is only permitted when writing.
+
+     If the flush parameter is Z_FINISH, the remaining data is written and the
+   gzip stream is completed in the output.  If gzwrite() is called again, a new
+   gzip stream will be started in the output.  gzread() is able to read such
+   concatenated gzip streams.
+
+     gzflush should be called only when strictly necessary because it will
+   degrade compression if called too often.
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT gzseek(gzFile file,
+                               z_off_t offset, int whence);
+
+     Set the starting position to offset relative to whence for the next gzread
+   or gzwrite on file.  The offset represents a number of bytes in the
+   uncompressed data stream.  The whence parameter is defined as in lseek(2);
+   the value SEEK_END is not supported.
+
+     If the file is opened for reading, this function is emulated but can be
+   extremely slow.  If the file is opened for writing, only forward seeks are
+   supported; gzseek then compresses a sequence of zeroes up to the new
+   starting position.
+
+     gzseek returns the resulting offset location as measured in bytes from
+   the beginning of the uncompressed stream, or -1 in case of error, in
+   particular if the file is opened for writing and the new starting position
+   would be before the current position.
+*/
+
+ZEXTERN int ZEXPORT    gzrewind(gzFile file);
+/*
+     Rewind file. This function is supported only for reading.
+
+     gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET).
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT    gztell(gzFile file);
+
+     Return the starting position for the next gzread or gzwrite on file.
+   This position represents a number of bytes in the uncompressed data stream,
+   and is zero when starting, even if appending or reading a gzip stream from
+   the middle of a file using gzdopen().
+
+     gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
+/*
+ZEXTERN z_off_t ZEXPORT gzoffset(gzFile file);
+
+     Return the current compressed (actual) read or write offset of file.  This
+   offset includes the count of bytes that precede the gzip stream, for example
+   when appending or when using gzdopen() for reading.  When reading, the
+   offset does not include as yet unused buffered input.  This information can
+   be used for a progress indicator.  On error, gzoffset() returns -1.
+*/
+
+ZEXTERN int ZEXPORT gzeof(gzFile file);
+/*
+     Return true (1) if the end-of-file indicator for file has been set while
+   reading, false (0) otherwise.  Note that the end-of-file indicator is set
+   only if the read tried to go past the end of the input, but came up short.
+   Therefore, just like feof(), gzeof() may return false even if there is no
+   more data to read, in the event that the last read request was for the exact
+   number of bytes remaining in the input file.  This will happen if the input
+   file size is an exact multiple of the buffer size.
+
+     If gzeof() returns true, then the read functions will return no more data,
+   unless the end-of-file indicator is reset by gzclearerr() and the input file
+   has grown since the previous end of file was detected.
+*/
+
+ZEXTERN int ZEXPORT gzdirect(gzFile file);
+/*
+     Return true (1) if file is being copied directly while reading, or false
+   (0) if file is a gzip stream being decompressed.
+
+     If the input file is empty, gzdirect() will return true, since the input
+   does not contain a gzip stream.
+
+     If gzdirect() is used immediately after gzopen() or gzdopen() it will
+   cause buffers to be allocated to allow reading the file to determine if it
+   is a gzip file.  Therefore if gzbuffer() is used, it should be called before
+   gzdirect().
+
+     When writing, gzdirect() returns true (1) if transparent writing was
+   requested ("wT" for the gzopen() mode), or false (0) otherwise.  (Note:
+   gzdirect() is not needed when writing.  Transparent writing must be
+   explicitly requested, so the application already knows the answer.  When
+   linking statically, using gzdirect() will include all of the zlib code for
+   gzip file reading and decompression, which may not be desired.)
+*/
+
+ZEXTERN int ZEXPORT    gzclose(gzFile file);
+/*
+     Flush all pending output for file, if necessary, close file and
+   deallocate the (de)compression state.  Note that once file is closed, you
+   cannot call gzerror with file, since its structures have been deallocated.
+   gzclose must not be called more than once on the same file, just as free
+   must not be called more than once on the same allocation.
+
+     gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a
+   file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the
+   last read ended in the middle of a gzip stream, or Z_OK on success.
+*/
+
+ZEXTERN int ZEXPORT gzclose_r(gzFile file);
+ZEXTERN int ZEXPORT gzclose_w(gzFile file);
+/*
+     Same as gzclose(), but gzclose_r() is only for use when reading, and
+   gzclose_w() is only for use when writing or appending.  The advantage to
+   using these instead of gzclose() is that they avoid linking in zlib
+   compression or decompression code that is not used when only reading or only
+   writing respectively.  If gzclose() is used, then both compression and
+   decompression code will be included in the application when linking to a
+   static zlib library.
+*/
+
+ZEXTERN const char * ZEXPORT gzerror(gzFile file, int *errnum);
+/*
+     Return the error message for the last error which occurred on file.
+   errnum is set to zlib error number.  If an error occurred in the file system
+   and not in the compression library, errnum is set to Z_ERRNO and the
+   application may consult errno to get the exact error code.
+
+     The application must not modify the returned string.  Future calls to
+   this function may invalidate the previously returned string.  If file is
+   closed, then the string previously returned by gzerror will no longer be
+   available.
+
+     gzerror() should be used to distinguish errors from end-of-file for those
+   functions above that do not distinguish those cases in their return values.
+*/
+
+ZEXTERN void ZEXPORT gzclearerr(gzFile file);
+/*
+     Clear the error and end-of-file flags for file.  This is analogous to the
+   clearerr() function in stdio.  This is useful for continuing to read a gzip
+   file that is being written concurrently.
+*/
+
+#endif /* !Z_SOLO */
+
+                        /* checksum functions */
+
+/*
+     These functions are not related to compression but are exported
+   anyway because they might be useful in applications using the compression
+   library.
+*/
+
+ZEXTERN uLong ZEXPORT adler32(uLong adler, const Bytef *buf, uInt len);
+/*
+     Update a running Adler-32 checksum with the bytes buf[0..len-1] and
+   return the updated checksum. An Adler-32 value is in the range of a 32-bit
+   unsigned integer. If buf is Z_NULL, this function returns the required
+   initial value for the checksum.
+
+     An Adler-32 checksum is almost as reliable as a CRC-32 but can be computed
+   much faster.
+
+   Usage example:
+
+     uLong adler = adler32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       adler = adler32(adler, buffer, length);
+     }
+     if (adler != original_adler) error();
+*/
+
+ZEXTERN uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf,
+                                z_size_t len);
+/*
+     Same as adler32(), but with a size_t length.
+*/
+
+/*
+ZEXTERN uLong ZEXPORT adler32_combine(uLong adler1, uLong adler2,
+                                      z_off_t len2);
+
+     Combine two Adler-32 checksums into one.  For two sequences of bytes, seq1
+   and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
+   each, adler1 and adler2.  adler32_combine() returns the Adler-32 checksum of
+   seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.  Note
+   that the z_off_t type (like off_t) is a signed integer.  If len2 is
+   negative, the result has no meaning or utility.
+*/
+
+ZEXTERN uLong ZEXPORT crc32(uLong crc, const Bytef *buf, uInt len);
+/*
+     Update a running CRC-32 with the bytes buf[0..len-1] and return the
+   updated CRC-32. A CRC-32 value is in the range of a 32-bit unsigned integer.
+   If buf is Z_NULL, this function returns the required initial value for the
+   crc. Pre- and post-conditioning (one's complement) is performed within this
+   function so it shouldn't be done by the application.
+
+   Usage example:
+
+     uLong crc = crc32(0L, Z_NULL, 0);
+
+     while (read_buffer(buffer, length) != EOF) {
+       crc = crc32(crc, buffer, length);
+     }
+     if (crc != original_crc) error();
+*/
+
+ZEXTERN uLong ZEXPORT crc32_z(uLong crc, const Bytef *buf,
+                              z_size_t len);
+/*
+     Same as crc32(), but with a size_t length.
+*/
+
+/*
+ZEXTERN uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2);
+
+     Combine two CRC-32 check values into one.  For two sequences of bytes,
+   seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
+   calculated for each, crc1 and crc2.  crc32_combine() returns the CRC-32
+   check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and
+   len2. len2 must be non-negative.
+*/
+
+/*
+ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t len2);
+
+     Return the operator corresponding to length len2, to be used with
+   crc32_combine_op(). len2 must be non-negative.
+*/
+
+ZEXTERN uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op);
+/*
+     Give the same result as crc32_combine(), using op in place of len2. op is
+   generated from len2 by crc32_combine_gen(). This will be faster than
+   crc32_combine() if the generated op is used more than once.
+*/
+
+
+                        /* various hacks, don't look :) */
+
+/* deflateInit and inflateInit are macros to allow checking the zlib version
+ * and the compiler's view of z_stream:
+ */
+ZEXTERN int ZEXPORT deflateInit_(z_streamp strm, int level,
+                                 const char *version, int stream_size);
+ZEXTERN int ZEXPORT inflateInit_(z_streamp strm,
+                                 const char *version, int stream_size);
+ZEXTERN int ZEXPORT deflateInit2_(z_streamp strm, int  level, int  method,
+                                  int windowBits, int memLevel,
+                                  int strategy, const char *version,
+                                  int stream_size);
+ZEXTERN int ZEXPORT inflateInit2_(z_streamp strm, int  windowBits,
+                                  const char *version, int stream_size);
+ZEXTERN int ZEXPORT inflateBackInit_(z_streamp strm, int windowBits,
+                                     unsigned char FAR *window,
+                                     const char *version,
+                                     int stream_size);
+#ifdef Z_PREFIX_SET
+#  define z_deflateInit(strm, level) \
+          deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define z_inflateInit(strm) \
+          inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define z_deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+          deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+                        (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define z_inflateInit2(strm, windowBits) \
+          inflateInit2_((strm), (windowBits), ZLIB_VERSION, \
+                        (int)sizeof(z_stream))
+#  define z_inflateBackInit(strm, windowBits, window) \
+          inflateBackInit_((strm), (windowBits), (window), \
+                           ZLIB_VERSION, (int)sizeof(z_stream))
+#else
+#  define deflateInit(strm, level) \
+          deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define inflateInit(strm) \
+          inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
+          deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
+                        (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
+#  define inflateInit2(strm, windowBits) \
+          inflateInit2_((strm), (windowBits), ZLIB_VERSION, \
+                        (int)sizeof(z_stream))
+#  define inflateBackInit(strm, windowBits, window) \
+          inflateBackInit_((strm), (windowBits), (window), \
+                           ZLIB_VERSION, (int)sizeof(z_stream))
+#endif
+
+#ifndef Z_SOLO
+
+/* gzgetc() macro and its supporting function and exposed data structure.  Note
+ * that the real internal state is much larger than the exposed structure.
+ * This abbreviated structure exposes just enough for the gzgetc() macro.  The
+ * user should not mess with these exposed elements, since their names or
+ * behavior could change in the future, perhaps even capriciously.  They can
+ * only be used by the gzgetc() macro.  You have been warned.
+ */
+struct gzFile_s {
+    unsigned have;
+    unsigned char *next;
+    z_off64_t pos;
+};
+ZEXTERN int ZEXPORT gzgetc_(gzFile file);       /* backward compatibility */
+#ifdef Z_PREFIX_SET
+#  undef z_gzgetc
+#  define z_gzgetc(g) \
+          ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : (gzgetc)(g))
+#else
+#  define gzgetc(g) \
+          ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : (gzgetc)(g))
+#endif
+
+/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or
+ * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if
+ * both are true, the application gets the *64 functions, and the regular
+ * functions are changed to 64 bits) -- in case these are set on systems
+ * without large file support, _LFS64_LARGEFILE must also be true
+ */
+#ifdef Z_LARGE64
+   ZEXTERN gzFile ZEXPORT gzopen64(const char *, const char *);
+   ZEXTERN z_off64_t ZEXPORT gzseek64(gzFile, z_off64_t, int);
+   ZEXTERN z_off64_t ZEXPORT gztell64(gzFile);
+   ZEXTERN z_off64_t ZEXPORT gzoffset64(gzFile);
+   ZEXTERN uLong ZEXPORT adler32_combine64(uLong, uLong, z_off64_t);
+   ZEXTERN uLong ZEXPORT crc32_combine64(uLong, uLong, z_off64_t);
+   ZEXTERN uLong ZEXPORT crc32_combine_gen64(z_off64_t);
+#endif
+
+#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64)
+#  ifdef Z_PREFIX_SET
+#    define z_gzopen z_gzopen64
+#    define z_gzseek z_gzseek64
+#    define z_gztell z_gztell64
+#    define z_gzoffset z_gzoffset64
+#    define z_adler32_combine z_adler32_combine64
+#    define z_crc32_combine z_crc32_combine64
+#    define z_crc32_combine_gen z_crc32_combine_gen64
+#  else
+#    define gzopen gzopen64
+#    define gzseek gzseek64
+#    define gztell gztell64
+#    define gzoffset gzoffset64
+#    define adler32_combine adler32_combine64
+#    define crc32_combine crc32_combine64
+#    define crc32_combine_gen crc32_combine_gen64
+#  endif
+#  ifndef Z_LARGE64
+     ZEXTERN gzFile ZEXPORT gzopen64(const char *, const char *);
+     ZEXTERN z_off_t ZEXPORT gzseek64(gzFile, z_off_t, int);
+     ZEXTERN z_off_t ZEXPORT gztell64(gzFile);
+     ZEXTERN z_off_t ZEXPORT gzoffset64(gzFile);
+     ZEXTERN uLong ZEXPORT adler32_combine64(uLong, uLong, z_off_t);
+     ZEXTERN uLong ZEXPORT crc32_combine64(uLong, uLong, z_off_t);
+     ZEXTERN uLong ZEXPORT crc32_combine_gen64(z_off_t);
+#  endif
+#else
+   ZEXTERN gzFile ZEXPORT gzopen(const char *, const char *);
+   ZEXTERN z_off_t ZEXPORT gzseek(gzFile, z_off_t, int);
+   ZEXTERN z_off_t ZEXPORT gztell(gzFile);
+   ZEXTERN z_off_t ZEXPORT gzoffset(gzFile);
+   ZEXTERN uLong ZEXPORT adler32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t);
+#endif
+
+#else /* Z_SOLO */
+
+   ZEXTERN uLong ZEXPORT adler32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine(uLong, uLong, z_off_t);
+   ZEXTERN uLong ZEXPORT crc32_combine_gen(z_off_t);
+
+#endif /* !Z_SOLO */
+
+/* undocumented functions */
+ZEXTERN const char   * ZEXPORT zError(int);
+ZEXTERN int            ZEXPORT inflateSyncPoint(z_streamp);
+ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table(void);
+ZEXTERN int            ZEXPORT inflateUndermine(z_streamp, int);
+ZEXTERN int            ZEXPORT inflateValidate(z_streamp, int);
+ZEXTERN unsigned long  ZEXPORT inflateCodesUsed(z_streamp);
+ZEXTERN int            ZEXPORT inflateResetKeep(z_streamp);
+ZEXTERN int            ZEXPORT deflateResetKeep(z_streamp);
+#if defined(_WIN32) && !defined(Z_SOLO)
+ZEXTERN gzFile         ZEXPORT gzopen_w(const wchar_t *path,
+                                        const char *mode);
+#endif
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+#  ifndef Z_SOLO
+ZEXTERN int            ZEXPORTVA gzvprintf(gzFile file,
+                                           const char *format,
+                                           va_list va);
+#  endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZLIB_H */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/htmlparser.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/htmlparser.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..31dcc406cdc3d006afe811e7e1f778b56407510f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/htmlparser.pxd
@@ -0,0 +1,56 @@
+from libc.string cimport const_char
+
+from lxml.includes.tree cimport xmlDoc
+from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
+from lxml.includes.xmlparser cimport xmlParserCtxt, xmlSAXHandler, xmlSAXHandlerV1
+
+cdef extern from "libxml/HTMLparser.h" nogil:
+    ctypedef enum htmlParserOption:
+        HTML_PARSE_NOERROR    # suppress error reports
+        HTML_PARSE_NOWARNING  # suppress warning reports
+        HTML_PARSE_PEDANTIC   # pedantic error reporting
+        HTML_PARSE_NOBLANKS   # remove blank nodes
+        HTML_PARSE_NONET      # Forbid network access
+        # libxml2 2.6.21+ only:
+        HTML_PARSE_RECOVER    # Relaxed parsing
+        HTML_PARSE_COMPACT    # compact small text nodes
+        # libxml2 2.7.7+ only:
+        HTML_PARSE_NOIMPLIED  # Do not add implied html/body... elements
+        # libxml2 2.7.8+ only:
+        HTML_PARSE_NODEFDTD   # do not default a doctype if not found
+        # libxml2 2.8.0+ only:
+        XML_PARSE_IGNORE_ENC  # ignore internal document encoding hint
+
+    xmlSAXHandlerV1 htmlDefaultSAXHandler
+
+    cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(
+        char* buffer, int size)
+    cdef xmlParserCtxt* htmlCreateFileParserCtxt(
+        char* filename, char* encoding)
+    cdef xmlParserCtxt* htmlCreatePushParserCtxt(xmlSAXHandler* sax,
+                                                 void* user_data,
+                                                 char* chunk, int size,
+                                                 char* filename, int enc)
+    cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt)
+    cdef void htmlCtxtReset(xmlParserCtxt* ctxt)
+    cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options)
+    cdef int htmlParseDocument(xmlParserCtxt* ctxt)
+    cdef int htmlParseChunk(xmlParserCtxt* ctxt, 
+                            char* chunk, int size, int terminate)
+
+    cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt,
+                                  char* filename, const_char* encoding,
+                                  int options)
+    cdef xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt,
+                                 char* buffer, char* URL, const_char* encoding,
+                                 int options)
+    cdef xmlDoc* htmlCtxtReadIO(xmlParserCtxt* ctxt, 
+                                xmlInputReadCallback ioread, 
+                                xmlInputCloseCallback ioclose, 
+                                void* ioctx,
+                                char* URL, const_char* encoding,
+                                int options)
+    cdef xmlDoc* htmlCtxtReadMemory(xmlParserCtxt* ctxt,
+                                    char* buffer, int size,
+                                    char* filename, const_char* encoding,
+                                    int options)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baeab1cb4e5688692c8f4ff813cf7f10e7904e32
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libexslt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea4697672b43dde3663b43831beaa7f91e75bebf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxml/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3e07a3aaf90df3db76528fed2a67ee4772e6eeb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/numbersInternals.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/numbersInternals.h
new file mode 100644
index 0000000000000000000000000000000000000000..8524592811aedebb82db9412b95c4bb50d918503
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/numbersInternals.h
@@ -0,0 +1,73 @@
+/*
+ * Summary: Implementation of the XSLT number functions
+ * Description: Implementation of the XSLT number functions
+ *
+ * Copy: See Copyright for the status of this software.
+ *
+ * Author: Bjorn Reese <breese@users.sourceforge.net> and Daniel Veillard
+ */
+
+#ifndef __XML_XSLT_NUMBERSINTERNALS_H__
+#define __XML_XSLT_NUMBERSINTERNALS_H__
+
+#include <libxml/tree.h>
+#include "xsltexports.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct _xsltCompMatch;
+
+/**
+ * xsltNumberData:
+ *
+ * This data structure is just a wrapper to pass xsl:number data in.
+ */
+typedef struct _xsltNumberData xsltNumberData;
+typedef xsltNumberData *xsltNumberDataPtr;
+
+struct _xsltNumberData {
+    const xmlChar *level;
+    const xmlChar *count;
+    const xmlChar *from;
+    const xmlChar *value;
+    const xmlChar *format;
+    int has_format;
+    int digitsPerGroup;
+    int groupingCharacter;
+    int groupingCharacterLen;
+    xmlDocPtr doc;
+    xmlNodePtr node;
+    struct _xsltCompMatch *countPat;
+    struct _xsltCompMatch *fromPat;
+
+    /*
+     * accelerators
+     */
+};
+
+/**
+ * xsltFormatNumberInfo:
+ *
+ * This data structure lists the various parameters needed to format numbers.
+ */
+typedef struct _xsltFormatNumberInfo xsltFormatNumberInfo;
+typedef xsltFormatNumberInfo *xsltFormatNumberInfoPtr;
+
+struct _xsltFormatNumberInfo {
+    int	    integer_hash;	/* Number of '#' in integer part */
+    int	    integer_digits;	/* Number of '0' in integer part */
+    int	    frac_digits;	/* Number of '0' in fractional part */
+    int	    frac_hash;		/* Number of '#' in fractional part */
+    int	    group;		/* Number of chars per display 'group' */
+    int     multiplier;		/* Scaling for percent or permille */
+    char    add_decimal;	/* Flag for whether decimal point appears in pattern */
+    char    is_multiplier_set;	/* Flag to catch multiple occurrences of percent/permille */
+    char    is_negative_pattern;/* Flag for processing -ve prefix/suffix */
+};
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* __XML_XSLT_NUMBERSINTERNALS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/preproc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/preproc.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a2fc7e4305e567464874ba6cb7bcba69f85efdf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/preproc.h
@@ -0,0 +1,43 @@
+/*
+ * Summary: precomputing stylesheets
+ * Description: this is the compilation phase, where most of the
+ *              stylesheet is "compiled" into faster to use data.
+ *
+ * Copy: See Copyright for the status of this software.
+ *
+ * Author: Daniel Veillard
+ */
+
+#ifndef __XML_XSLT_PRECOMP_H__
+#define __XML_XSLT_PRECOMP_H__
+
+#include <libxml/tree.h>
+#include "xsltexports.h"
+#include "xsltInternals.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Interfaces
+ */
+XSLTPUBVAR const xmlChar *xsltExtMarker;
+
+XSLTPUBFUN xsltElemPreCompPtr XSLTCALL
+		xsltDocumentComp	(xsltStylesheetPtr style,
+					 xmlNodePtr inst,
+					 xsltTransformFunction function);
+
+XSLTPUBFUN void XSLTCALL
+		xsltStylePreCompute	(xsltStylesheetPtr style,
+					 xmlNodePtr inst);
+XSLTPUBFUN void XSLTCALL
+		xsltFreeStylePreComps	(xsltStylesheetPtr style);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __XML_XSLT_PRECOMP_H__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/xsltexports.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/xsltexports.h
new file mode 100644
index 0000000000000000000000000000000000000000..95c352fee2a808c4e3f12e4273ab4622eeb249b8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/libxslt/xsltexports.h
@@ -0,0 +1,64 @@
+/*
+ * Summary: macros for marking symbols as exportable/importable.
+ * Description: macros for marking symbols as exportable/importable.
+ *
+ * Copy: See Copyright for the status of this software.
+ */
+
+#ifndef __XSLT_EXPORTS_H__
+#define __XSLT_EXPORTS_H__
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+/** DOC_DISABLE */
+
+#ifdef LIBXSLT_STATIC
+  #define XSLTPUBLIC
+#elif defined(IN_LIBXSLT)
+  #define XSLTPUBLIC __declspec(dllexport)
+#else
+  #define XSLTPUBLIC __declspec(dllimport)
+#endif
+
+#define XSLTCALL __cdecl
+
+/** DOC_ENABLE */
+#else /* not Windows */
+
+/**
+ * XSLTPUBLIC:
+ *
+ * Macro which declares a public symbol
+ */
+#define XSLTPUBLIC
+
+/**
+ * XSLTCALL:
+ *
+ * Macro which declares the calling convention for exported functions
+ */
+#define XSLTCALL
+
+#endif /* platform switch */
+
+/*
+ * XSLTPUBFUN:
+ *
+ * Macro which declares an exportable function
+ */
+#define XSLTPUBFUN XSLTPUBLIC
+
+/**
+ * XSLTPUBVAR:
+ *
+ * Macro which declares an exportable variable
+ */
+#define XSLTPUBVAR XSLTPUBLIC extern
+
+/* Compatibility */
+#if !defined(LIBXSLT_PUBLIC)
+#define LIBXSLT_PUBLIC XSLTPUBVAR
+#endif
+
+#endif /* __XSLT_EXPORTS_H__ */
+
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/lxml-version.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/lxml-version.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ad0146eb7d6c49c603081547a08fdad13f20af4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/lxml-version.h
@@ -0,0 +1,3 @@
+#ifndef LXML_VERSION_STRING
+#define LXML_VERSION_STRING "6.0.2"
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/tree.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/tree.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..43a52e647be68a5aea00f7363df1df1de4f2a48c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/tree.pxd
@@ -0,0 +1,492 @@
+from libc cimport stdio
+from libc.string cimport const_char, const_uchar
+
+cdef extern from "lxml-version.h":
+    # deprecated declaration, use etreepublic.pxd instead
+    cdef char* LXML_VERSION_STRING
+
+cdef extern from "libxml/xmlversion.h":
+    const char* xmlParserVersion
+    int LIBXML_VERSION
+
+
+cdef extern from "libxml/xmlstring.h" nogil:
+    ctypedef unsigned char xmlChar
+    ctypedef const xmlChar const_xmlChar "const xmlChar"
+    cdef int xmlStrlen(const_xmlChar* str)
+    cdef xmlChar* xmlStrdup(const_xmlChar* cur)
+    cdef int xmlStrncmp(const_xmlChar* str1, const_xmlChar* str2, int length)
+    cdef int xmlStrcmp(const_xmlChar* str1, const_xmlChar* str2)
+    cdef int xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2)
+    cdef const_xmlChar* xmlStrstr(const_xmlChar* str1, const_xmlChar* str2)
+    cdef const_xmlChar* xmlStrchr(const_xmlChar* str1, xmlChar ch)
+    cdef const_xmlChar* _xcstr "(const xmlChar*)PyBytes_AS_STRING" (object s)
+
+cdef extern from "libxml/encoding.h" nogil:
+    ctypedef enum xmlCharEncoding:
+        XML_CHAR_ENCODING_ERROR = -1 # No char encoding detected
+        XML_CHAR_ENCODING_NONE = 0 # No char encoding detected
+        XML_CHAR_ENCODING_UTF8 = 1 # UTF-8
+        XML_CHAR_ENCODING_UTF16LE = 2 # UTF-16 little endian
+        XML_CHAR_ENCODING_UTF16BE = 3 # UTF-16 big endian
+        XML_CHAR_ENCODING_UCS4LE = 4 # UCS-4 little endian
+        XML_CHAR_ENCODING_UCS4BE = 5 # UCS-4 big endian
+        XML_CHAR_ENCODING_EBCDIC = 6 # EBCDIC uh!
+        XML_CHAR_ENCODING_UCS4_2143 = 7 # UCS-4 unusual ordering
+        XML_CHAR_ENCODING_UCS4_3412 = 8 # UCS-4 unusual ordering
+        XML_CHAR_ENCODING_UCS2 = 9 # UCS-2
+        XML_CHAR_ENCODING_8859_1 = 10 # ISO-8859-1 ISO Latin 1
+        XML_CHAR_ENCODING_8859_2 = 11 # ISO-8859-2 ISO Latin 2
+        XML_CHAR_ENCODING_8859_3 = 12 # ISO-8859-3
+        XML_CHAR_ENCODING_8859_4 = 13 # ISO-8859-4
+        XML_CHAR_ENCODING_8859_5 = 14 # ISO-8859-5
+        XML_CHAR_ENCODING_8859_6 = 15 # ISO-8859-6
+        XML_CHAR_ENCODING_8859_7 = 16 # ISO-8859-7
+        XML_CHAR_ENCODING_8859_8 = 17 # ISO-8859-8
+        XML_CHAR_ENCODING_8859_9 = 18 # ISO-8859-9
+        XML_CHAR_ENCODING_2022_JP = 19 # ISO-2022-JP
+        XML_CHAR_ENCODING_SHIFT_JIS = 20 # Shift_JIS
+        XML_CHAR_ENCODING_EUC_JP = 21 # EUC-JP
+        XML_CHAR_ENCODING_ASCII = 22 # pure ASCII
+
+    ctypedef struct xmlCharEncodingHandler:
+        char* name
+
+    cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name)
+    cdef xmlCharEncodingHandler* xmlGetCharEncodingHandler(
+        xmlCharEncoding enc)
+    cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler)
+    cdef xmlCharEncoding xmlDetectCharEncoding(const_xmlChar* text, int len)
+    cdef const_char* xmlGetCharEncodingName(xmlCharEncoding enc)
+    cdef xmlCharEncoding xmlParseCharEncoding(char* name)
+    ctypedef int (*xmlCharEncodingOutputFunc)(
+            unsigned char *out_buf, int *outlen, const_uchar *in_buf, int *inlen)
+
+cdef extern from "libxml/chvalid.h" nogil:
+    cdef int xmlIsChar_ch(char c)
+    cdef int xmlIsCharQ(int ch)
+
+cdef extern from "libxml/hash.h":
+    ctypedef struct xmlHashTable
+    ctypedef void (*xmlHashScanner)(void* payload, void* data, const_xmlChar* name) noexcept  # may require GIL!
+    void xmlHashScan(xmlHashTable* table, xmlHashScanner f, void* data) nogil
+    void* xmlHashLookup(xmlHashTable* table, const_xmlChar* name) nogil
+    ctypedef void (*xmlHashDeallocator)(void *payload, xmlChar *name) noexcept
+    cdef xmlHashTable* xmlHashCreate(int size) nogil
+    cdef xmlHashTable* xmlHashCreateDict(int size, xmlDict *dict) nogil
+    cdef int xmlHashSize(xmlHashTable* table) nogil
+    cdef void xmlHashFree(xmlHashTable* table, xmlHashDeallocator f) nogil
+
+cdef extern from * nogil: # actually "libxml/dict.h"
+    # libxml/dict.h appears to be broken to include in C
+    ctypedef struct xmlDict
+    cdef const_xmlChar* xmlDictLookup(xmlDict* dict, const_xmlChar* name, int len)
+    cdef const_xmlChar* xmlDictExists(xmlDict* dict, const_xmlChar* name, int len)
+    cdef int xmlDictOwns(xmlDict* dict, const_xmlChar* name)
+    cdef size_t xmlDictSize(xmlDict* dict)
+
+cdef extern from "libxml/tree.h" nogil:
+    ctypedef struct xmlDoc
+    ctypedef struct xmlAttr
+    ctypedef struct xmlNotationTable
+
+    ctypedef enum xmlElementType:
+        XML_ELEMENT_NODE=           1
+        XML_ATTRIBUTE_NODE=         2
+        XML_TEXT_NODE=              3
+        XML_CDATA_SECTION_NODE=     4
+        XML_ENTITY_REF_NODE=        5
+        XML_ENTITY_NODE=            6
+        XML_PI_NODE=                7
+        XML_COMMENT_NODE=           8
+        XML_DOCUMENT_NODE=          9
+        XML_DOCUMENT_TYPE_NODE=     10
+        XML_DOCUMENT_FRAG_NODE=     11
+        XML_NOTATION_NODE=          12
+        XML_HTML_DOCUMENT_NODE=     13
+        XML_DTD_NODE=               14
+        XML_ELEMENT_DECL=           15
+        XML_ATTRIBUTE_DECL=         16
+        XML_ENTITY_DECL=            17
+        XML_NAMESPACE_DECL=         18
+        XML_XINCLUDE_START=         19
+        XML_XINCLUDE_END=           20
+
+    ctypedef enum xmlElementTypeVal:
+        XML_ELEMENT_TYPE_UNDEFINED= 0
+        XML_ELEMENT_TYPE_EMPTY=     1
+        XML_ELEMENT_TYPE_ANY=       2
+        XML_ELEMENT_TYPE_MIXED=     3
+        XML_ELEMENT_TYPE_ELEMENT=   4
+
+    ctypedef enum xmlElementContentType:
+        XML_ELEMENT_CONTENT_PCDATA=  1
+        XML_ELEMENT_CONTENT_ELEMENT= 2
+        XML_ELEMENT_CONTENT_SEQ=     3
+        XML_ELEMENT_CONTENT_OR=      4
+
+    ctypedef enum xmlElementContentOccur:
+        XML_ELEMENT_CONTENT_ONCE= 1
+        XML_ELEMENT_CONTENT_OPT=  2
+        XML_ELEMENT_CONTENT_MULT= 3
+        XML_ELEMENT_CONTENT_PLUS= 4
+
+    ctypedef enum xmlAttributeType:
+        XML_ATTRIBUTE_CDATA =      1
+        XML_ATTRIBUTE_ID=          2
+        XML_ATTRIBUTE_IDREF=       3
+        XML_ATTRIBUTE_IDREFS=      4
+        XML_ATTRIBUTE_ENTITY=      5
+        XML_ATTRIBUTE_ENTITIES=    6
+        XML_ATTRIBUTE_NMTOKEN=     7
+        XML_ATTRIBUTE_NMTOKENS=    8
+        XML_ATTRIBUTE_ENUMERATION= 9
+        XML_ATTRIBUTE_NOTATION=    10
+
+    ctypedef enum xmlAttributeDefault:
+        XML_ATTRIBUTE_NONE=     1
+        XML_ATTRIBUTE_REQUIRED= 2
+        XML_ATTRIBUTE_IMPLIED=  3
+        XML_ATTRIBUTE_FIXED=    4
+
+    ctypedef enum xmlEntityType:
+        XML_INTERNAL_GENERAL_ENTITY=          1
+        XML_EXTERNAL_GENERAL_PARSED_ENTITY=   2
+        XML_EXTERNAL_GENERAL_UNPARSED_ENTITY= 3
+        XML_INTERNAL_PARAMETER_ENTITY=        4
+        XML_EXTERNAL_PARAMETER_ENTITY=        5
+        XML_INTERNAL_PREDEFINED_ENTITY=       6
+
+    ctypedef enum xmlDocProperties:
+        XML_DOC_WELLFORMED          = 1    # /* document is XML well formed */
+        XML_DOC_NSVALID             = 2    # /* document is Namespace valid */
+        XML_DOC_OLD10               = 4    # /* parsed with old XML-1.0 parser */
+        XML_DOC_DTDVALID            = 8    # /* DTD validation was successful */
+        XML_DOC_XINCLUDE            = 16   # /* XInclude substitution was done */
+        XML_DOC_USERBUILT           = 32   # /* Document was built using the API
+                                           #    and not by parsing an instance */
+        XML_DOC_INTERNAL            = 64   # /* built for internal processing */
+        XML_DOC_HTML                = 128  # /* parsed or built HTML document */
+
+    ctypedef struct xmlNs:
+        const_xmlChar* href
+        const_xmlChar* prefix
+        xmlNs* next
+
+    ctypedef struct xmlNode:
+        void* _private
+        xmlElementType   type
+        const_xmlChar* name
+        xmlNode* children
+        xmlNode* last
+        xmlNode* parent
+        xmlNode* next
+        xmlNode* prev
+        xmlDoc* doc
+        xmlChar* content
+        xmlAttr* properties
+        xmlNs* ns
+        xmlNs* nsDef
+        unsigned short line
+
+    ctypedef struct xmlElementContent:
+        xmlElementContentType type
+        xmlElementContentOccur ocur
+        const_xmlChar *name
+        xmlElementContent *c1
+        xmlElementContent *c2
+        xmlElementContent *parent
+        const_xmlChar *prefix
+
+    ctypedef struct xmlEnumeration:
+        xmlEnumeration *next
+        const_xmlChar *name
+
+    ctypedef struct xmlAttribute:
+        void* _private
+        xmlElementType type
+        const_xmlChar* name
+        xmlNode* children
+        xmlNode* last
+        xmlDtd* parent
+        xmlNode* next
+        xmlNode* prev
+        xmlDoc* doc
+        xmlAttribute* nexth
+        xmlAttributeType atype
+        xmlAttributeDefault def_ "def"
+        const_xmlChar* defaultValue
+        xmlEnumeration* tree
+        const_xmlChar* prefix
+        const_xmlChar* elem
+
+    ctypedef struct xmlElement:
+        void* _private
+        xmlElementType   type
+        const_xmlChar* name
+        xmlNode* children
+        xmlNode* last
+        xmlNode* parent
+        xmlNode* next
+        xmlNode* prev
+        xmlDoc* doc
+        xmlElementTypeVal etype
+        xmlElementContent* content
+        xmlAttribute* attributes
+        const_xmlChar* prefix
+        void *contModel
+
+    ctypedef struct xmlEntity:
+        void* _private
+        xmlElementType type
+        const_xmlChar* name
+        xmlNode* children
+        xmlNode* last
+        xmlDtd* parent
+        xmlNode* next
+        xmlNode* prev
+        xmlDoc* doc
+        xmlChar* orig
+        xmlChar* content
+        int length
+        xmlEntityType etype
+        const_xmlChar* ExternalID
+        const_xmlChar* SystemID
+        xmlEntity* nexte
+        const_xmlChar* URI
+        int owner
+        int checked
+
+    ctypedef struct xmlDtd:
+        const_xmlChar* name
+        const_xmlChar* ExternalID
+        const_xmlChar* SystemID
+        void* notations
+        void* entities
+        void* pentities
+        void* attributes
+        void* elements
+        xmlNode* children
+        xmlNode* last
+        xmlDoc* doc
+
+    ctypedef struct xmlDoc:
+        xmlElementType type
+        char* name
+        xmlNode* children
+        xmlNode* last
+        xmlNode* parent
+        xmlNode* next
+        xmlNode* prev
+        xmlDoc* doc
+        xmlDict* dict
+        xmlHashTable* ids
+        int standalone
+        const_xmlChar* version
+        const_xmlChar* encoding
+        const_xmlChar* URL
+        void* _private
+        xmlDtd* intSubset
+        xmlDtd* extSubset
+        int properties
+
+    ctypedef struct xmlAttr:
+        void* _private
+        xmlElementType type
+        const_xmlChar* name
+        xmlNode* children
+        xmlNode* last
+        xmlNode* parent
+        xmlAttr* next
+        xmlAttr* prev
+        xmlDoc* doc
+        xmlNs* ns
+        xmlAttributeType atype
+
+    ctypedef struct xmlID:
+        const_xmlChar* value
+        const_xmlChar* name
+        xmlAttr* attr
+        xmlDoc* doc
+
+    ctypedef struct xmlBuffer
+
+    ctypedef struct xmlBuf   # new in libxml2 2.9
+
+    ctypedef struct xmlOutputBuffer:
+        xmlBuf* buffer
+        xmlBuf* conv
+        int error
+
+    const_xmlChar* XML_XML_NAMESPACE
+
+    cdef void xmlFreeDoc(xmlDoc* cur)
+    cdef void xmlFreeDtd(xmlDtd* cur)
+    cdef void xmlFreeNode(xmlNode* cur)
+    cdef void xmlFreeNsList(xmlNs* ns)
+    cdef void xmlFreeNs(xmlNs* ns)
+    cdef void xmlFree(void* buf)
+
+    cdef xmlNode* xmlNewNode(xmlNs* ns, const_xmlChar* name)  # node constructors: each returns an xmlNode*
+    cdef xmlNode* xmlNewDocText(xmlDoc* doc, const_xmlChar* content)
+    cdef xmlNode* xmlNewDocComment(xmlDoc* doc, const_xmlChar* content)
+    cdef xmlNode* xmlNewDocPI(xmlDoc* doc, const_xmlChar* name, const_xmlChar* content)
+    cdef xmlNode* xmlNewReference(xmlDoc* doc, const_xmlChar* name)
+    cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, const_xmlChar* text, int len)  # takes an explicit int length alongside the text
+    cdef xmlNs* xmlNewNs(xmlNode* node, const_xmlChar* href, const_xmlChar* prefix)
+    cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur)
+    cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur)
+    cdef xmlNode* xmlAddPrevSibling(xmlNode* cur, xmlNode* elem)
+    cdef xmlNode* xmlAddNextSibling(xmlNode* cur, xmlNode* elem)
+    cdef xmlNode* xmlNewDocNode(xmlDoc* doc, xmlNs* ns,
+                                const_xmlChar* name, const_xmlChar* content)
+    cdef xmlDoc* xmlNewDoc(const_xmlChar* version)
+    cdef xmlAttr* xmlNewProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value)
+    cdef xmlAttr* xmlNewNsProp(xmlNode* node, xmlNs* ns,
+                               const_xmlChar* name, const_xmlChar* value)
+    cdef xmlChar* xmlGetNoNsProp(xmlNode* node, const_xmlChar* name)  # returns non-const xmlChar*; NOTE(review): presumably caller-owned - confirm
+    cdef xmlChar* xmlGetNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace)
+    cdef void xmlSetNs(xmlNode* node, xmlNs* ns)
+    cdef xmlAttr* xmlSetProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value)
+    cdef xmlAttr* xmlSetNsProp(xmlNode* node, xmlNs* ns,
+                               const_xmlChar* name, const_xmlChar* value)
+    cdef int xmlRemoveID(xmlDoc* doc, xmlAttr* cur)
+    cdef int xmlRemoveProp(xmlAttr* cur)
+    cdef void xmlFreePropList(xmlAttr* cur)
+    cdef xmlChar* xmlGetNodePath(xmlNode* node)
+    cdef void xmlDocDumpMemory(xmlDoc* cur, char** mem, int* size)  # results come back through the `mem` / `size` pointer arguments
+    cdef void xmlDocDumpMemoryEnc(xmlDoc* cur, char** mem, int* size,
+                                  char* encoding)
+    cdef int xmlSaveFileTo(xmlOutputBuffer* out, xmlDoc* cur,
+                           char* encoding)
+
+    cdef void xmlUnlinkNode(xmlNode* cur)  # tree manipulation, lookup and serialisation declarations follow
+    cdef xmlNode* xmlDocSetRootElement(xmlDoc* doc, xmlNode* root)
+    cdef xmlNode* xmlDocGetRootElement(xmlDoc* doc)
+    cdef void xmlSetTreeDoc(xmlNode* tree, xmlDoc* doc)
+    cdef xmlAttr* xmlHasProp(xmlNode* node, const_xmlChar* name)
+    cdef xmlAttr* xmlHasNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace)
+    cdef xmlChar* xmlNodeGetContent(xmlNode* cur)
+    cdef int xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur)
+    cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, const_xmlChar* prefix)
+    cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, const_xmlChar* href)
+    cdef int xmlIsBlankNode(xmlNode* node)
+    cdef long xmlGetLineNo(xmlNode* node)  # declared with a long return type, unlike the int-returning neighbours
+    cdef void xmlElemDump(stdio.FILE* f, xmlDoc* doc, xmlNode* cur)
+    cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf,
+                                xmlDoc* doc, xmlNode* cur, int level,
+                                int format, const_char* encoding)
+    cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc,
+                                xmlAttr *attr, const_xmlChar *string)
+    cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name)
+    cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content)
+    cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd)
+    cdef xmlDoc* xmlCopyDoc(xmlDoc* doc, int recursive)
+    cdef xmlNode* xmlCopyNode(xmlNode* node, int extended)
+    cdef xmlNode* xmlDocCopyNode(xmlNode* node, xmlDoc* doc, int extended)
+    cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree)
+    cdef xmlNs* xmlNewReconciliedNs(xmlDoc* doc, xmlNode* tree, xmlNs* ns)
+    cdef xmlBuffer* xmlBufferCreate()  # xmlBuffer* API: create / write / free / content / length
+    cdef void xmlBufferWriteChar(xmlBuffer* buf, char* string)
+    cdef void xmlBufferFree(xmlBuffer* buf)
+    cdef const_xmlChar* xmlBufferContent(xmlBuffer* buf)
+    cdef int xmlBufferLength(xmlBuffer* buf)
+    cdef const_xmlChar* xmlBufContent(xmlBuf* buf) # new in libxml2 2.9
+    cdef size_t xmlBufUse(xmlBuf* buf) # new in libxml2 2.9; returns size_t where xmlBufferLength returns int
+    cdef int xmlKeepBlanksDefault(int val)
+    cdef xmlChar* xmlNodeGetBase(xmlDoc* doc, xmlNode* node)
+    cdef xmlDtd* xmlCreateIntSubset(xmlDoc* doc, const_xmlChar* name,
+                                    const_xmlChar* ExternalID, const_xmlChar* SystemID)
+    cdef void xmlNodeSetBase(xmlNode* node, const_xmlChar* uri)
+    cdef int xmlValidateNCName(const_xmlChar* value, int space)
+
+cdef extern from "libxml/uri.h" nogil:  # declarations taken from libxml/uri.h; whole block is nogil
+    cdef const_xmlChar* xmlBuildURI(const_xmlChar* href, const_xmlChar* base)  # NOTE(review): declared const here; check result ownership against the libxml2 header
+
+cdef extern from "libxml/HTMLtree.h" nogil:  # declarations taken from libxml/HTMLtree.h; whole block is nogil
+    cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf,
+                                       xmlDoc* doc, xmlNode* cur,
+                                       char* encoding, int format)
+    cdef xmlDoc* htmlNewDoc(const_xmlChar* uri, const_xmlChar* externalID)  # returns an xmlDoc* (same document type as the XML API)
+
+cdef extern from "libxml/valid.h" nogil:  # declarations taken from libxml/valid.h; whole block is nogil
+    cdef xmlAttr* xmlGetID(xmlDoc* doc, const_xmlChar* ID)  # maps an ID string within a document to an xmlAttr*
+    cdef void xmlDumpNotationTable(xmlBuffer* buffer,
+                                   xmlNotationTable* table)
+    cdef int xmlValidateNameValue(const_xmlChar* value)
+
+cdef extern from "libxml/xmlIO.h":  # block itself is not nogil; nogil is applied per declaration below
+    cdef int xmlOutputBufferWrite(xmlOutputBuffer* out,
+                                  int len, const_char* str) nogil
+    cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, const_char* str) nogil
+    cdef int xmlOutputBufferWriteEscape(xmlOutputBuffer* out,
+                                        const_xmlChar* str,
+                                        xmlCharEncodingOutputFunc escapefunc) nogil
+    cdef int xmlOutputBufferFlush(xmlOutputBuffer* out) nogil
+    cdef int xmlOutputBufferClose(xmlOutputBuffer* out) nogil
+
+    ctypedef int (*xmlInputReadCallback)(void* context,
+                                         char* buffer, int len) noexcept nogil
+    ctypedef int (*xmlInputCloseCallback)(void* context) noexcept nogil  # input callbacks: noexcept and nogil
+
+    ctypedef int (*xmlOutputWriteCallback)(void* context,
+                                           char* buffer, int len) noexcept
+    ctypedef int (*xmlOutputCloseCallback)(void* context) noexcept  # output callbacks: noexcept only, no nogil
+
+    cdef xmlOutputBuffer* xmlAllocOutputBuffer(
+        xmlCharEncodingHandler* encoder) nogil
+    cdef xmlOutputBuffer* xmlOutputBufferCreateIO(
+        xmlOutputWriteCallback iowrite,
+        xmlOutputCloseCallback ioclose,
+        void * ioctx,
+        xmlCharEncodingHandler* encoder) nogil
+    cdef xmlOutputBuffer* xmlOutputBufferCreateFile(
+        stdio.FILE* file, xmlCharEncodingHandler* encoder) nogil
+    cdef xmlOutputBuffer* xmlOutputBufferCreateFilename(
+        char* URI, xmlCharEncodingHandler* encoder, int compression) nogil
+
+cdef extern from "libxml/xmlsave.h" nogil:  # declarations taken from libxml/xmlsave.h; whole block is nogil
+    ctypedef struct xmlSaveCtxt  # no members declared: usable only through pointers
+
+    ctypedef enum xmlSaveOption:  # values are distinct powers of two (1..64)
+        XML_SAVE_FORMAT   = 1   # format save output            (2.6.17)
+        XML_SAVE_NO_DECL  = 2   # drop the xml declaration      (2.6.21)
+        XML_SAVE_NO_EMPTY = 4   # no empty tags                 (2.6.22)
+        XML_SAVE_NO_XHTML = 8   # disable XHTML1 specific rules (2.6.22)
+        XML_SAVE_XHTML = 16     # force XHTML1 specific rules         (2.7.2)
+        XML_SAVE_AS_XML = 32    # force XML serialization on HTML doc (2.7.2)
+        XML_SAVE_AS_HTML = 64   # force HTML serialization on XML doc (2.7.2)
+
+    cdef xmlSaveCtxt* xmlSaveToFilename(char* filename, char* encoding,
+                                        int options)
+    cdef xmlSaveCtxt* xmlSaveToBuffer(xmlBuffer* buffer, char* encoding,
+                                      int options) # libxml2 2.6.23
+    cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc)
+    cdef long xmlSaveTree(xmlSaveCtxt* ctxt, xmlNode* node)
+    cdef int xmlSaveClose(xmlSaveCtxt* ctxt)
+    cdef int xmlSaveFlush(xmlSaveCtxt* ctxt)
+    cdef int xmlSaveSetAttrEscape(xmlSaveCtxt* ctxt, void* escape_func)  # callback is passed as an untyped void* in this declaration
+    cdef int xmlSaveSetEscape(xmlSaveCtxt* ctxt, void* escape_func)
+
+cdef extern from "libxml/globals.h" nogil:  # three int -> int functions taking an `onoff` argument
+    cdef int xmlThrDefKeepBlanksDefaultValue(int onoff)  # NOTE(review): presumably returns the previous setting - confirm in libxml2 docs
+    cdef int xmlThrDefLineNumbersDefaultValue(int onoff)
+    cdef int xmlThrDefIndentTreeOutput(int onoff)
+
+cdef extern from "libxml/xmlmemory.h" nogil:  # declarations taken from libxml/xmlmemory.h; whole block is nogil
+    cdef void* xmlMalloc(size_t size)  # returns an untyped pointer for a size_t byte count
+    cdef int xmlMemBlocks()
+    cdef int xmlMemUsed()
+
+cdef extern from "etree_defs.h" nogil:  # "etree_defs.h" has no libxml/ path prefix, unlike the other headers here
+    cdef bint _isElement(xmlNode* node)
+    cdef bint _isElementOrXInclude(xmlNode* node)
+    cdef const_xmlChar* _getNs(xmlNode* node)
+    cdef void BEGIN_FOR_EACH_ELEMENT_FROM(xmlNode* tree_top,  # NOTE(review): BEGIN_/END_ pairs look like C macros declared with function syntax - confirm in etree_defs.h
+                                          xmlNode* start_node,
+                                          bint inclusive)
+    cdef void END_FOR_EACH_ELEMENT_FROM(xmlNode* start_node)
+    cdef void BEGIN_FOR_EACH_FROM(xmlNode* tree_top,
+                                  xmlNode* start_node,
+                                  bint inclusive)
+    cdef void END_FOR_EACH_FROM(xmlNode* start_node)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/uri.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/uri.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..f886a54b9a9d7fc3094f99210f54edd98f12fab7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/uri.pxd
@@ -0,0 +1,5 @@
+cdef extern from "libxml/uri.h" nogil:  # declarations taken from libxml/uri.h; whole block is nogil
+    ctypedef struct xmlURI  # no members declared: usable only through pointers
+
+    cdef xmlURI* xmlParseURI(char* str)  # NOTE(review): presumably the result is released with xmlFreeURI below - confirm
+    cdef void xmlFreeURI(xmlURI* uri)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xinclude.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xinclude.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..68267175afa602f6bb0970566bd5f71f2d318af7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xinclude.pxd
@@ -0,0 +1,22 @@
+from lxml.includes.tree cimport xmlDoc, xmlNode
+
+cdef extern from "libxml/xinclude.h" nogil:  # declarations taken from libxml/xinclude.h; whole block is nogil
+
+    ctypedef struct xmlXIncludeCtxt  # no members declared: usable only through pointers
+
+    cdef int xmlXIncludeProcess(xmlDoc* doc)
+    cdef int xmlXIncludeProcessFlags(xmlDoc* doc, int parser_opts)
+    cdef int xmlXIncludeProcessTree(xmlNode* doc)  # the Tree variants take an xmlNode* even though the parameter is named `doc`
+    cdef int xmlXIncludeProcessTreeFlags(xmlNode* doc, int parser_opts)
+
+    # libxml2 >= 2.7.4
+    cdef int xmlXIncludeProcessTreeFlagsData(
+            xmlNode* doc, int parser_opts, void* data)
+
+    cdef xmlXIncludeCtxt* xmlXIncludeNewContext(xmlDoc* doc)  # context-based API: functions operating on an xmlXIncludeCtxt*
+    cdef int xmlXIncludeProcessNode(xmlXIncludeCtxt* ctxt, xmlNode* node)
+    cdef int xmlXIncludeSetFlags(xmlXIncludeCtxt* ctxt, int flags)
+
+    # libxml2 >= 2.6.27
+    cdef int xmlXIncludeProcessFlagsData(
+        xmlDoc* doc, int flags, void* data)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlerror.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlerror.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..0249a45e2c4b757f236ab5891510934f9add6b3e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlerror.pxd
@@ -0,0 +1,860 @@
+
+# --- BEGIN: GENERATED CONSTANTS ---
+
+# This section is generated by the script 'update-error-constants.py'.
+
+cdef extern from "libxml/xmlerror.h":
+    ctypedef enum xmlErrorLevel:  # generated: four levels, numbered 0 (success) to 3 (fatal error)
+        XML_ERR_NONE                                       =       0 # Success
+        XML_ERR_WARNING                                    =       1 # A warning
+        XML_ERR_ERROR                                      =       2 # An error
+        XML_ERR_FATAL                                      =       3 # A fatal error
+
+    ctypedef enum xmlErrorDomain:  # generated: one value per library module, numbered consecutively 0..30
+        XML_FROM_NONE                                      =       0 # Unknown
+        XML_FROM_PARSER                                    =       1 # The XML parser
+        XML_FROM_TREE                                      =       2 # The tree module (unused)
+        XML_FROM_NAMESPACE                                 =       3 # The XML Namespace module
+        XML_FROM_DTD                                       =       4 # The XML DTD validation with parser context
+        XML_FROM_HTML                                      =       5 # The HTML parser
+        XML_FROM_MEMORY                                    =       6 # The memory allocator (unused)
+        XML_FROM_OUTPUT                                    =       7 # The serialization code
+        XML_FROM_IO                                        =       8 # The Input/Output stack
+        XML_FROM_FTP                                       =       9 # The FTP module (unused)
+        XML_FROM_HTTP                                      =      10 # The HTTP module (unused)
+        XML_FROM_XINCLUDE                                  =      11 # The XInclude processing
+        XML_FROM_XPATH                                     =      12 # The XPath module
+        XML_FROM_XPOINTER                                  =      13 # The XPointer module
+        XML_FROM_REGEXP                                    =      14 # The regular expressions module
+        XML_FROM_DATATYPE                                  =      15 # The W3C XML Schemas Datatype module
+        XML_FROM_SCHEMASP                                  =      16 # The W3C XML Schemas parser module
+        XML_FROM_SCHEMASV                                  =      17 # The W3C XML Schemas validation module
+        XML_FROM_RELAXNGP                                  =      18 # The Relax-NG parser module
+        XML_FROM_RELAXNGV                                  =      19 # The Relax-NG validator module
+        XML_FROM_CATALOG                                   =      20 # The Catalog module
+        XML_FROM_C14N                                      =      21 # The Canonicalization module
+        XML_FROM_XSLT                                      =      22 # The XSLT engine from libxslt (unused)
+        XML_FROM_VALID                                     =      23 # The XML DTD validation with valid context
+        XML_FROM_CHECK                                     =      24 # The error checking module (unused)
+        XML_FROM_WRITER                                    =      25 # The xmlwriter module
+        XML_FROM_MODULE                                    =      26 # The dynamically loaded module module (unused)
+        XML_FROM_I18N                                      =      27 # The module handling character conversion (unused)
+        XML_FROM_SCHEMATRONV                               =      28 # The Schematron validator module
+        XML_FROM_BUFFER                                    =      29 # The buffers module (unused)
+        XML_FROM_URI                                       =      30 # The URI module (unused)
+
+    ctypedef enum xmlParserErrors:
+        XML_ERR_OK                                         =       0 # Success
+        XML_ERR_INTERNAL_ERROR                             =       1 # Internal assertion failure
+        XML_ERR_NO_MEMORY                                  =       2 # Out of memory
+        XML_ERR_DOCUMENT_START                             =       3
+        XML_ERR_DOCUMENT_EMPTY                             =       4
+        XML_ERR_DOCUMENT_END                               =       5
+        XML_ERR_INVALID_HEX_CHARREF                        =       6
+        XML_ERR_INVALID_DEC_CHARREF                        =       7
+        XML_ERR_INVALID_CHARREF                            =       8
+        XML_ERR_INVALID_CHAR                               =       9
+        XML_ERR_CHARREF_AT_EOF                             =      10
+        XML_ERR_CHARREF_IN_PROLOG                          =      11
+        XML_ERR_CHARREF_IN_EPILOG                          =      12
+        XML_ERR_CHARREF_IN_DTD                             =      13
+        XML_ERR_ENTITYREF_AT_EOF                           =      14
+        XML_ERR_ENTITYREF_IN_PROLOG                        =      15
+        XML_ERR_ENTITYREF_IN_EPILOG                        =      16
+        XML_ERR_ENTITYREF_IN_DTD                           =      17
+        XML_ERR_PEREF_AT_EOF                               =      18
+        XML_ERR_PEREF_IN_PROLOG                            =      19
+        XML_ERR_PEREF_IN_EPILOG                            =      20
+        XML_ERR_PEREF_IN_INT_SUBSET                        =      21
+        XML_ERR_ENTITYREF_NO_NAME                          =      22
+        XML_ERR_ENTITYREF_SEMICOL_MISSING                  =      23
+        XML_ERR_PEREF_NO_NAME                              =      24
+        XML_ERR_PEREF_SEMICOL_MISSING                      =      25
+        XML_ERR_UNDECLARED_ENTITY                          =      26
+        XML_WAR_UNDECLARED_ENTITY                          =      27
+        XML_ERR_UNPARSED_ENTITY                            =      28
+        XML_ERR_ENTITY_IS_EXTERNAL                         =      29
+        XML_ERR_ENTITY_IS_PARAMETER                        =      30
+        XML_ERR_UNKNOWN_ENCODING                           =      31
+        XML_ERR_UNSUPPORTED_ENCODING                       =      32 # Unsupported character encoding
+        XML_ERR_STRING_NOT_STARTED                         =      33
+        XML_ERR_STRING_NOT_CLOSED                          =      34
+        XML_ERR_NS_DECL_ERROR                              =      35
+        XML_ERR_ENTITY_NOT_STARTED                         =      36
+        XML_ERR_ENTITY_NOT_FINISHED                        =      37
+        XML_ERR_LT_IN_ATTRIBUTE                            =      38
+        XML_ERR_ATTRIBUTE_NOT_STARTED                      =      39
+        XML_ERR_ATTRIBUTE_NOT_FINISHED                     =      40
+        XML_ERR_ATTRIBUTE_WITHOUT_VALUE                    =      41
+        XML_ERR_ATTRIBUTE_REDEFINED                        =      42
+        XML_ERR_LITERAL_NOT_STARTED                        =      43
+        XML_ERR_LITERAL_NOT_FINISHED                       =      44
+        XML_ERR_COMMENT_NOT_FINISHED                       =      45
+        XML_ERR_PI_NOT_STARTED                             =      46
+        XML_ERR_PI_NOT_FINISHED                            =      47
+        XML_ERR_NOTATION_NOT_STARTED                       =      48
+        XML_ERR_NOTATION_NOT_FINISHED                      =      49
+        XML_ERR_ATTLIST_NOT_STARTED                        =      50
+        XML_ERR_ATTLIST_NOT_FINISHED                       =      51
+        XML_ERR_MIXED_NOT_STARTED                          =      52
+        XML_ERR_MIXED_NOT_FINISHED                         =      53
+        XML_ERR_ELEMCONTENT_NOT_STARTED                    =      54
+        XML_ERR_ELEMCONTENT_NOT_FINISHED                   =      55
+        XML_ERR_XMLDECL_NOT_STARTED                        =      56
+        XML_ERR_XMLDECL_NOT_FINISHED                       =      57
+        XML_ERR_CONDSEC_NOT_STARTED                        =      58
+        XML_ERR_CONDSEC_NOT_FINISHED                       =      59
+        XML_ERR_EXT_SUBSET_NOT_FINISHED                    =      60
+        XML_ERR_DOCTYPE_NOT_FINISHED                       =      61
+        XML_ERR_MISPLACED_CDATA_END                        =      62
+        XML_ERR_CDATA_NOT_FINISHED                         =      63
+        XML_ERR_RESERVED_XML_NAME                          =      64
+        XML_ERR_SPACE_REQUIRED                             =      65
+        XML_ERR_SEPARATOR_REQUIRED                         =      66
+        XML_ERR_NMTOKEN_REQUIRED                           =      67
+        XML_ERR_NAME_REQUIRED                              =      68
+        XML_ERR_PCDATA_REQUIRED                            =      69
+        XML_ERR_URI_REQUIRED                               =      70
+        XML_ERR_PUBID_REQUIRED                             =      71
+        XML_ERR_LT_REQUIRED                                =      72
+        XML_ERR_GT_REQUIRED                                =      73
+        XML_ERR_LTSLASH_REQUIRED                           =      74
+        XML_ERR_EQUAL_REQUIRED                             =      75
+        XML_ERR_TAG_NAME_MISMATCH                          =      76
+        XML_ERR_TAG_NOT_FINISHED                           =      77
+        XML_ERR_STANDALONE_VALUE                           =      78
+        XML_ERR_ENCODING_NAME                              =      79
+        XML_ERR_HYPHEN_IN_COMMENT                          =      80
+        XML_ERR_INVALID_ENCODING                           =      81
+        XML_ERR_EXT_ENTITY_STANDALONE                      =      82
+        XML_ERR_CONDSEC_INVALID                            =      83
+        XML_ERR_VALUE_REQUIRED                             =      84
+        XML_ERR_NOT_WELL_BALANCED                          =      85
+        XML_ERR_EXTRA_CONTENT                              =      86
+        XML_ERR_ENTITY_CHAR_ERROR                          =      87
+        XML_ERR_ENTITY_PE_INTERNAL                         =      88
+        XML_ERR_ENTITY_LOOP                                =      89
+        XML_ERR_ENTITY_BOUNDARY                            =      90
+        XML_ERR_INVALID_URI                                =      91
+        XML_ERR_URI_FRAGMENT                               =      92
+        XML_WAR_CATALOG_PI                                 =      93
+        XML_ERR_NO_DTD                                     =      94
+        XML_ERR_CONDSEC_INVALID_KEYWORD                    =      95
+        XML_ERR_VERSION_MISSING                            =      96
+        XML_WAR_UNKNOWN_VERSION                            =      97
+        XML_WAR_LANG_VALUE                                 =      98
+        XML_WAR_NS_URI                                     =      99
+        XML_WAR_NS_URI_RELATIVE                            =     100
+        XML_ERR_MISSING_ENCODING                           =     101
+        XML_WAR_SPACE_VALUE                                =     102
+        XML_ERR_NOT_STANDALONE                             =     103
+        XML_ERR_ENTITY_PROCESSING                          =     104
+        XML_ERR_NOTATION_PROCESSING                        =     105
+        XML_WAR_NS_COLUMN                                  =     106
+        XML_WAR_ENTITY_REDEFINED                           =     107
+        XML_ERR_UNKNOWN_VERSION                            =     108
+        XML_ERR_VERSION_MISMATCH                           =     109
+        XML_ERR_NAME_TOO_LONG                              =     110
+        XML_ERR_USER_STOP                                  =     111
+        XML_ERR_COMMENT_ABRUPTLY_ENDED                     =     112
+        XML_WAR_ENCODING_MISMATCH                          =     113
+        XML_ERR_RESOURCE_LIMIT                             =     114 # Internal resource limit like maximum amplification factor exceeded
+        XML_ERR_ARGUMENT                                   =     115 # Invalid argument
+        XML_ERR_SYSTEM                                     =     116 # Unexpected error from the OS or an external library
+        XML_ERR_REDECL_PREDEF_ENTITY                       =     117
+        XML_ERR_INT_SUBSET_NOT_FINISHED                    =     118
+        XML_NS_ERR_XML_NAMESPACE                           =     200
+        XML_NS_ERR_UNDEFINED_NAMESPACE                     =     201
+        XML_NS_ERR_QNAME                                   =     202
+        XML_NS_ERR_ATTRIBUTE_REDEFINED                     =     203
+        XML_NS_ERR_EMPTY                                   =     204
+        XML_NS_ERR_COLON                                   =     205
+        XML_DTD_ATTRIBUTE_DEFAULT                          =     500
+        XML_DTD_ATTRIBUTE_REDEFINED                        =     501
+        XML_DTD_ATTRIBUTE_VALUE                            =     502
+        XML_DTD_CONTENT_ERROR                              =     503
+        XML_DTD_CONTENT_MODEL                              =     504
+        XML_DTD_CONTENT_NOT_DETERMINIST                    =     505
+        XML_DTD_DIFFERENT_PREFIX                           =     506
+        XML_DTD_ELEM_DEFAULT_NAMESPACE                     =     507
+        XML_DTD_ELEM_NAMESPACE                             =     508
+        XML_DTD_ELEM_REDEFINED                             =     509
+        XML_DTD_EMPTY_NOTATION                             =     510
+        XML_DTD_ENTITY_TYPE                                =     511
+        XML_DTD_ID_FIXED                                   =     512
+        XML_DTD_ID_REDEFINED                               =     513
+        XML_DTD_ID_SUBSET                                  =     514
+        XML_DTD_INVALID_CHILD                              =     515
+        XML_DTD_INVALID_DEFAULT                            =     516
+        XML_DTD_LOAD_ERROR                                 =     517
+        XML_DTD_MISSING_ATTRIBUTE                          =     518
+        XML_DTD_MIXED_CORRUPT                              =     519
+        XML_DTD_MULTIPLE_ID                                =     520
+        XML_DTD_NO_DOC                                     =     521
+        XML_DTD_NO_DTD                                     =     522
+        XML_DTD_NO_ELEM_NAME                               =     523
+        XML_DTD_NO_PREFIX                                  =     524
+        XML_DTD_NO_ROOT                                    =     525
+        XML_DTD_NOTATION_REDEFINED                         =     526
+        XML_DTD_NOTATION_VALUE                             =     527
+        XML_DTD_NOT_EMPTY                                  =     528
+        XML_DTD_NOT_PCDATA                                 =     529
+        XML_DTD_NOT_STANDALONE                             =     530
+        XML_DTD_ROOT_NAME                                  =     531
+        XML_DTD_STANDALONE_WHITE_SPACE                     =     532
+        XML_DTD_UNKNOWN_ATTRIBUTE                          =     533
+        XML_DTD_UNKNOWN_ELEM                               =     534
+        XML_DTD_UNKNOWN_ENTITY                             =     535
+        XML_DTD_UNKNOWN_ID                                 =     536
+        XML_DTD_UNKNOWN_NOTATION                           =     537
+        XML_DTD_STANDALONE_DEFAULTED                       =     538
+        XML_DTD_XMLID_VALUE                                =     539
+        XML_DTD_XMLID_TYPE                                 =     540
+        XML_DTD_DUP_TOKEN                                  =     541
+        XML_HTML_STRUCURE_ERROR                            =     800
+        XML_HTML_UNKNOWN_TAG                               =     801
+        XML_HTML_INCORRECTLY_OPENED_COMMENT                =     802
+        XML_RNGP_ANYNAME_ATTR_ANCESTOR                     =    1000
+        XML_RNGP_ATTR_CONFLICT                             =    1001
+        XML_RNGP_ATTRIBUTE_CHILDREN                        =    1002
+        XML_RNGP_ATTRIBUTE_CONTENT                         =    1003
+        XML_RNGP_ATTRIBUTE_EMPTY                           =    1004
+        XML_RNGP_ATTRIBUTE_NOOP                            =    1005
+        XML_RNGP_CHOICE_CONTENT                            =    1006
+        XML_RNGP_CHOICE_EMPTY                              =    1007
+        XML_RNGP_CREATE_FAILURE                            =    1008
+        XML_RNGP_DATA_CONTENT                              =    1009
+        XML_RNGP_DEF_CHOICE_AND_INTERLEAVE                 =    1010
+        XML_RNGP_DEFINE_CREATE_FAILED                      =    1011
+        XML_RNGP_DEFINE_EMPTY                              =    1012
+        XML_RNGP_DEFINE_MISSING                            =    1013
+        XML_RNGP_DEFINE_NAME_MISSING                       =    1014
+        XML_RNGP_ELEM_CONTENT_EMPTY                        =    1015
+        XML_RNGP_ELEM_CONTENT_ERROR                        =    1016
+        XML_RNGP_ELEMENT_EMPTY                             =    1017
+        XML_RNGP_ELEMENT_CONTENT                           =    1018
+        XML_RNGP_ELEMENT_NAME                              =    1019
+        XML_RNGP_ELEMENT_NO_CONTENT                        =    1020
+        XML_RNGP_ELEM_TEXT_CONFLICT                        =    1021
+        XML_RNGP_EMPTY                                     =    1022
+        XML_RNGP_EMPTY_CONSTRUCT                           =    1023
+        XML_RNGP_EMPTY_CONTENT                             =    1024
+        XML_RNGP_EMPTY_NOT_EMPTY                           =    1025
+        XML_RNGP_ERROR_TYPE_LIB                            =    1026
+        XML_RNGP_EXCEPT_EMPTY                              =    1027
+        XML_RNGP_EXCEPT_MISSING                            =    1028
+        XML_RNGP_EXCEPT_MULTIPLE                           =    1029
+        XML_RNGP_EXCEPT_NO_CONTENT                         =    1030
+        XML_RNGP_EXTERNALREF_EMTPY                         =    1031
+        XML_RNGP_EXTERNAL_REF_FAILURE                      =    1032
+        XML_RNGP_EXTERNALREF_RECURSE                       =    1033
+        XML_RNGP_FORBIDDEN_ATTRIBUTE                       =    1034
+        XML_RNGP_FOREIGN_ELEMENT                           =    1035
+        XML_RNGP_GRAMMAR_CONTENT                           =    1036
+        XML_RNGP_GRAMMAR_EMPTY                             =    1037
+        XML_RNGP_GRAMMAR_MISSING                           =    1038
+        XML_RNGP_GRAMMAR_NO_START                          =    1039
+        XML_RNGP_GROUP_ATTR_CONFLICT                       =    1040
+        XML_RNGP_HREF_ERROR                                =    1041
+        XML_RNGP_INCLUDE_EMPTY                             =    1042
+        XML_RNGP_INCLUDE_FAILURE                           =    1043
+        XML_RNGP_INCLUDE_RECURSE                           =    1044
+        XML_RNGP_INTERLEAVE_ADD                            =    1045
+        XML_RNGP_INTERLEAVE_CREATE_FAILED                  =    1046
+        XML_RNGP_INTERLEAVE_EMPTY                          =    1047
+        XML_RNGP_INTERLEAVE_NO_CONTENT                     =    1048
+        XML_RNGP_INVALID_DEFINE_NAME                       =    1049
+        XML_RNGP_INVALID_URI                               =    1050
+        XML_RNGP_INVALID_VALUE                             =    1051
+        XML_RNGP_MISSING_HREF                              =    1052
+        XML_RNGP_NAME_MISSING                              =    1053
+        XML_RNGP_NEED_COMBINE                              =    1054
+        XML_RNGP_NOTALLOWED_NOT_EMPTY                      =    1055
+        XML_RNGP_NSNAME_ATTR_ANCESTOR                      =    1056
+        XML_RNGP_NSNAME_NO_NS                              =    1057
+        XML_RNGP_PARAM_FORBIDDEN                           =    1058
+        XML_RNGP_PARAM_NAME_MISSING                        =    1059
+        XML_RNGP_PARENTREF_CREATE_FAILED                   =    1060
+        XML_RNGP_PARENTREF_NAME_INVALID                    =    1061
+        XML_RNGP_PARENTREF_NO_NAME                         =    1062
+        XML_RNGP_PARENTREF_NO_PARENT                       =    1063
+        XML_RNGP_PARENTREF_NOT_EMPTY                       =    1064
+        XML_RNGP_PARSE_ERROR                               =    1065
+        XML_RNGP_PAT_ANYNAME_EXCEPT_ANYNAME                =    1066
+        XML_RNGP_PAT_ATTR_ATTR                             =    1067
+        XML_RNGP_PAT_ATTR_ELEM                             =    1068
+        XML_RNGP_PAT_DATA_EXCEPT_ATTR                      =    1069
+        XML_RNGP_PAT_DATA_EXCEPT_ELEM                      =    1070
+        XML_RNGP_PAT_DATA_EXCEPT_EMPTY                     =    1071
+        XML_RNGP_PAT_DATA_EXCEPT_GROUP                     =    1072
+        XML_RNGP_PAT_DATA_EXCEPT_INTERLEAVE                =    1073
+        XML_RNGP_PAT_DATA_EXCEPT_LIST                      =    1074
+        XML_RNGP_PAT_DATA_EXCEPT_ONEMORE                   =    1075
+        XML_RNGP_PAT_DATA_EXCEPT_REF                       =    1076
+        XML_RNGP_PAT_DATA_EXCEPT_TEXT                      =    1077
+        XML_RNGP_PAT_LIST_ATTR                             =    1078
+        XML_RNGP_PAT_LIST_ELEM                             =    1079
+        XML_RNGP_PAT_LIST_INTERLEAVE                       =    1080
+        XML_RNGP_PAT_LIST_LIST                             =    1081
+        XML_RNGP_PAT_LIST_REF                              =    1082
+        XML_RNGP_PAT_LIST_TEXT                             =    1083
+        XML_RNGP_PAT_NSNAME_EXCEPT_ANYNAME                 =    1084
+        XML_RNGP_PAT_NSNAME_EXCEPT_NSNAME                  =    1085
+        XML_RNGP_PAT_ONEMORE_GROUP_ATTR                    =    1086
+        XML_RNGP_PAT_ONEMORE_INTERLEAVE_ATTR               =    1087
+        XML_RNGP_PAT_START_ATTR                            =    1088
+        XML_RNGP_PAT_START_DATA                            =    1089
+        XML_RNGP_PAT_START_EMPTY                           =    1090
+        XML_RNGP_PAT_START_GROUP                           =    1091
+        XML_RNGP_PAT_START_INTERLEAVE                      =    1092
+        XML_RNGP_PAT_START_LIST                            =    1093
+        XML_RNGP_PAT_START_ONEMORE                         =    1094
+        XML_RNGP_PAT_START_TEXT                            =    1095
+        XML_RNGP_PAT_START_VALUE                           =    1096
+        XML_RNGP_PREFIX_UNDEFINED                          =    1097
+        XML_RNGP_REF_CREATE_FAILED                         =    1098
+        XML_RNGP_REF_CYCLE                                 =    1099
+        XML_RNGP_REF_NAME_INVALID                          =    1100
+        XML_RNGP_REF_NO_DEF                                =    1101
+        XML_RNGP_REF_NO_NAME                               =    1102
+        XML_RNGP_REF_NOT_EMPTY                             =    1103
+        XML_RNGP_START_CHOICE_AND_INTERLEAVE               =    1104
+        XML_RNGP_START_CONTENT                             =    1105
+        XML_RNGP_START_EMPTY                               =    1106
+        XML_RNGP_START_MISSING                             =    1107
+        XML_RNGP_TEXT_EXPECTED                             =    1108
+        XML_RNGP_TEXT_HAS_CHILD                            =    1109
+        XML_RNGP_TYPE_MISSING                              =    1110
+        XML_RNGP_TYPE_NOT_FOUND                            =    1111
+        XML_RNGP_TYPE_VALUE                                =    1112
+        XML_RNGP_UNKNOWN_ATTRIBUTE                         =    1113
+        XML_RNGP_UNKNOWN_COMBINE                           =    1114
+        XML_RNGP_UNKNOWN_CONSTRUCT                         =    1115
+        XML_RNGP_UNKNOWN_TYPE_LIB                          =    1116
+        XML_RNGP_URI_FRAGMENT                              =    1117
+        XML_RNGP_URI_NOT_ABSOLUTE                          =    1118
+        XML_RNGP_VALUE_EMPTY                               =    1119
+        XML_RNGP_VALUE_NO_CONTENT                          =    1120
+        XML_RNGP_XMLNS_NAME                                =    1121
+        XML_RNGP_XML_NS                                    =    1122
+        XML_XPATH_EXPRESSION_OK                            =    1200
+        XML_XPATH_NUMBER_ERROR                             =    1201
+        XML_XPATH_UNFINISHED_LITERAL_ERROR                 =    1202
+        XML_XPATH_START_LITERAL_ERROR                      =    1203
+        XML_XPATH_VARIABLE_REF_ERROR                       =    1204
+        XML_XPATH_UNDEF_VARIABLE_ERROR                     =    1205
+        XML_XPATH_INVALID_PREDICATE_ERROR                  =    1206
+        XML_XPATH_EXPR_ERROR                               =    1207
+        XML_XPATH_UNCLOSED_ERROR                           =    1208
+        XML_XPATH_UNKNOWN_FUNC_ERROR                       =    1209
+        XML_XPATH_INVALID_OPERAND                          =    1210
+        XML_XPATH_INVALID_TYPE                             =    1211
+        XML_XPATH_INVALID_ARITY                            =    1212
+        XML_XPATH_INVALID_CTXT_SIZE                        =    1213
+        XML_XPATH_INVALID_CTXT_POSITION                    =    1214
+        XML_XPATH_MEMORY_ERROR                             =    1215
+        XML_XPTR_SYNTAX_ERROR                              =    1216
+        XML_XPTR_RESOURCE_ERROR                            =    1217
+        XML_XPTR_SUB_RESOURCE_ERROR                        =    1218
+        XML_XPATH_UNDEF_PREFIX_ERROR                       =    1219
+        XML_XPATH_ENCODING_ERROR                           =    1220
+        XML_XPATH_INVALID_CHAR_ERROR                       =    1221
+        XML_TREE_INVALID_HEX                               =    1300
+        XML_TREE_INVALID_DEC                               =    1301
+        XML_TREE_UNTERMINATED_ENTITY                       =    1302
+        XML_TREE_NOT_UTF8                                  =    1303
+        XML_SAVE_NOT_UTF8                                  =    1400
+        XML_SAVE_CHAR_INVALID                              =    1401
+        XML_SAVE_NO_DOCTYPE                                =    1402
+        XML_SAVE_UNKNOWN_ENCODING                          =    1403
+        XML_REGEXP_COMPILE_ERROR                           =    1450
+        XML_IO_UNKNOWN                                     =    1500
+        XML_IO_EACCES                                      =    1501
+        XML_IO_EAGAIN                                      =    1502
+        XML_IO_EBADF                                       =    1503
+        XML_IO_EBADMSG                                     =    1504
+        XML_IO_EBUSY                                       =    1505
+        XML_IO_ECANCELED                                   =    1506
+        XML_IO_ECHILD                                      =    1507
+        XML_IO_EDEADLK                                     =    1508
+        XML_IO_EDOM                                        =    1509
+        XML_IO_EEXIST                                      =    1510
+        XML_IO_EFAULT                                      =    1511
+        XML_IO_EFBIG                                       =    1512
+        XML_IO_EINPROGRESS                                 =    1513
+        XML_IO_EINTR                                       =    1514
+        XML_IO_EINVAL                                      =    1515
+        XML_IO_EIO                                         =    1516
+        XML_IO_EISDIR                                      =    1517
+        XML_IO_EMFILE                                      =    1518
+        XML_IO_EMLINK                                      =    1519
+        XML_IO_EMSGSIZE                                    =    1520
+        XML_IO_ENAMETOOLONG                                =    1521
+        XML_IO_ENFILE                                      =    1522
+        XML_IO_ENODEV                                      =    1523
+        XML_IO_ENOENT                                      =    1524 # File not found
+        XML_IO_ENOEXEC                                     =    1525
+        XML_IO_ENOLCK                                      =    1526
+        XML_IO_ENOMEM                                      =    1527
+        XML_IO_ENOSPC                                      =    1528
+        XML_IO_ENOSYS                                      =    1529
+        XML_IO_ENOTDIR                                     =    1530
+        XML_IO_ENOTEMPTY                                   =    1531
+        XML_IO_ENOTSUP                                     =    1532
+        XML_IO_ENOTTY                                      =    1533
+        XML_IO_ENXIO                                       =    1534
+        XML_IO_EPERM                                       =    1535
+        XML_IO_EPIPE                                       =    1536
+        XML_IO_ERANGE                                      =    1537
+        XML_IO_EROFS                                       =    1538
+        XML_IO_ESPIPE                                      =    1539
+        XML_IO_ESRCH                                       =    1540
+        XML_IO_ETIMEDOUT                                   =    1541
+        XML_IO_EXDEV                                       =    1542
+        XML_IO_NETWORK_ATTEMPT                             =    1543
+        XML_IO_ENCODER                                     =    1544
+        XML_IO_FLUSH                                       =    1545
+        XML_IO_WRITE                                       =    1546
+        XML_IO_NO_INPUT                                    =    1547
+        XML_IO_BUFFER_FULL                                 =    1548
+        XML_IO_LOAD_ERROR                                  =    1549
+        XML_IO_ENOTSOCK                                    =    1550
+        XML_IO_EISCONN                                     =    1551
+        XML_IO_ECONNREFUSED                                =    1552
+        XML_IO_ENETUNREACH                                 =    1553
+        XML_IO_EADDRINUSE                                  =    1554
+        XML_IO_EALREADY                                    =    1555
+        XML_IO_EAFNOSUPPORT                                =    1556
+        XML_IO_UNSUPPORTED_PROTOCOL                        =    1557
+        XML_XINCLUDE_RECURSION                             =    1600
+        XML_XINCLUDE_PARSE_VALUE                           =    1601
+        XML_XINCLUDE_ENTITY_DEF_MISMATCH                   =    1602
+        XML_XINCLUDE_NO_HREF                               =    1603
+        XML_XINCLUDE_NO_FALLBACK                           =    1604
+        XML_XINCLUDE_HREF_URI                              =    1605
+        XML_XINCLUDE_TEXT_FRAGMENT                         =    1606
+        XML_XINCLUDE_TEXT_DOCUMENT                         =    1607
+        XML_XINCLUDE_INVALID_CHAR                          =    1608
+        XML_XINCLUDE_BUILD_FAILED                          =    1609
+        XML_XINCLUDE_UNKNOWN_ENCODING                      =    1610
+        XML_XINCLUDE_MULTIPLE_ROOT                         =    1611
+        XML_XINCLUDE_XPTR_FAILED                           =    1612
+        XML_XINCLUDE_XPTR_RESULT                           =    1613
+        XML_XINCLUDE_INCLUDE_IN_INCLUDE                    =    1614
+        XML_XINCLUDE_FALLBACKS_IN_INCLUDE                  =    1615
+        XML_XINCLUDE_FALLBACK_NOT_IN_INCLUDE               =    1616
+        XML_XINCLUDE_DEPRECATED_NS                         =    1617
+        XML_XINCLUDE_FRAGMENT_ID                           =    1618
+        XML_CATALOG_MISSING_ATTR                           =    1650
+        XML_CATALOG_ENTRY_BROKEN                           =    1651
+        XML_CATALOG_PREFER_VALUE                           =    1652
+        XML_CATALOG_NOT_CATALOG                            =    1653
+        XML_CATALOG_RECURSION                              =    1654
+        XML_SCHEMAP_PREFIX_UNDEFINED                       =    1700
+        XML_SCHEMAP_ATTRFORMDEFAULT_VALUE                  =    1701
+        XML_SCHEMAP_ATTRGRP_NONAME_NOREF                   =    1702
+        XML_SCHEMAP_ATTR_NONAME_NOREF                      =    1703
+        XML_SCHEMAP_COMPLEXTYPE_NONAME_NOREF               =    1704
+        XML_SCHEMAP_ELEMFORMDEFAULT_VALUE                  =    1705
+        XML_SCHEMAP_ELEM_NONAME_NOREF                      =    1706
+        XML_SCHEMAP_EXTENSION_NO_BASE                      =    1707
+        XML_SCHEMAP_FACET_NO_VALUE                         =    1708
+        XML_SCHEMAP_FAILED_BUILD_IMPORT                    =    1709
+        XML_SCHEMAP_GROUP_NONAME_NOREF                     =    1710
+        XML_SCHEMAP_IMPORT_NAMESPACE_NOT_URI               =    1711
+        XML_SCHEMAP_IMPORT_REDEFINE_NSNAME                 =    1712
+        XML_SCHEMAP_IMPORT_SCHEMA_NOT_URI                  =    1713
+        XML_SCHEMAP_INVALID_BOOLEAN                        =    1714
+        XML_SCHEMAP_INVALID_ENUM                           =    1715
+        XML_SCHEMAP_INVALID_FACET                          =    1716
+        XML_SCHEMAP_INVALID_FACET_VALUE                    =    1717
+        XML_SCHEMAP_INVALID_MAXOCCURS                      =    1718
+        XML_SCHEMAP_INVALID_MINOCCURS                      =    1719
+        XML_SCHEMAP_INVALID_REF_AND_SUBTYPE                =    1720
+        XML_SCHEMAP_INVALID_WHITE_SPACE                    =    1721
+        XML_SCHEMAP_NOATTR_NOREF                           =    1722
+        XML_SCHEMAP_NOTATION_NO_NAME                       =    1723
+        XML_SCHEMAP_NOTYPE_NOREF                           =    1724
+        XML_SCHEMAP_REF_AND_SUBTYPE                        =    1725
+        XML_SCHEMAP_RESTRICTION_NONAME_NOREF               =    1726
+        XML_SCHEMAP_SIMPLETYPE_NONAME                      =    1727
+        XML_SCHEMAP_TYPE_AND_SUBTYPE                       =    1728
+        XML_SCHEMAP_UNKNOWN_ALL_CHILD                      =    1729
+        XML_SCHEMAP_UNKNOWN_ANYATTRIBUTE_CHILD             =    1730
+        XML_SCHEMAP_UNKNOWN_ATTR_CHILD                     =    1731
+        XML_SCHEMAP_UNKNOWN_ATTRGRP_CHILD                  =    1732
+        XML_SCHEMAP_UNKNOWN_ATTRIBUTE_GROUP                =    1733
+        XML_SCHEMAP_UNKNOWN_BASE_TYPE                      =    1734
+        XML_SCHEMAP_UNKNOWN_CHOICE_CHILD                   =    1735
+        XML_SCHEMAP_UNKNOWN_COMPLEXCONTENT_CHILD           =    1736
+        XML_SCHEMAP_UNKNOWN_COMPLEXTYPE_CHILD              =    1737
+        XML_SCHEMAP_UNKNOWN_ELEM_CHILD                     =    1738
+        XML_SCHEMAP_UNKNOWN_EXTENSION_CHILD                =    1739
+        XML_SCHEMAP_UNKNOWN_FACET_CHILD                    =    1740
+        XML_SCHEMAP_UNKNOWN_FACET_TYPE                     =    1741
+        XML_SCHEMAP_UNKNOWN_GROUP_CHILD                    =    1742
+        XML_SCHEMAP_UNKNOWN_IMPORT_CHILD                   =    1743
+        XML_SCHEMAP_UNKNOWN_LIST_CHILD                     =    1744
+        XML_SCHEMAP_UNKNOWN_NOTATION_CHILD                 =    1745
+        XML_SCHEMAP_UNKNOWN_PROCESSCONTENT_CHILD           =    1746
+        XML_SCHEMAP_UNKNOWN_REF                            =    1747
+        XML_SCHEMAP_UNKNOWN_RESTRICTION_CHILD              =    1748
+        XML_SCHEMAP_UNKNOWN_SCHEMAS_CHILD                  =    1749
+        XML_SCHEMAP_UNKNOWN_SEQUENCE_CHILD                 =    1750
+        XML_SCHEMAP_UNKNOWN_SIMPLECONTENT_CHILD            =    1751
+        XML_SCHEMAP_UNKNOWN_SIMPLETYPE_CHILD               =    1752
+        XML_SCHEMAP_UNKNOWN_TYPE                           =    1753
+        XML_SCHEMAP_UNKNOWN_UNION_CHILD                    =    1754
+        XML_SCHEMAP_ELEM_DEFAULT_FIXED                     =    1755
+        XML_SCHEMAP_REGEXP_INVALID                         =    1756
+        XML_SCHEMAP_FAILED_LOAD                            =    1757
+        XML_SCHEMAP_NOTHING_TO_PARSE                       =    1758
+        XML_SCHEMAP_NOROOT                                 =    1759
+        XML_SCHEMAP_REDEFINED_GROUP                        =    1760
+        XML_SCHEMAP_REDEFINED_TYPE                         =    1761
+        XML_SCHEMAP_REDEFINED_ELEMENT                      =    1762
+        XML_SCHEMAP_REDEFINED_ATTRGROUP                    =    1763
+        XML_SCHEMAP_REDEFINED_ATTR                         =    1764
+        XML_SCHEMAP_REDEFINED_NOTATION                     =    1765
+        XML_SCHEMAP_FAILED_PARSE                           =    1766
+        XML_SCHEMAP_UNKNOWN_PREFIX                         =    1767
+        XML_SCHEMAP_DEF_AND_PREFIX                         =    1768
+        XML_SCHEMAP_UNKNOWN_INCLUDE_CHILD                  =    1769
+        XML_SCHEMAP_INCLUDE_SCHEMA_NOT_URI                 =    1770
+        XML_SCHEMAP_INCLUDE_SCHEMA_NO_URI                  =    1771
+        XML_SCHEMAP_NOT_SCHEMA                             =    1772
+        XML_SCHEMAP_UNKNOWN_MEMBER_TYPE                    =    1773
+        XML_SCHEMAP_INVALID_ATTR_USE                       =    1774
+        XML_SCHEMAP_RECURSIVE                              =    1775
+        XML_SCHEMAP_SUPERNUMEROUS_LIST_ITEM_TYPE           =    1776
+        XML_SCHEMAP_INVALID_ATTR_COMBINATION               =    1777
+        XML_SCHEMAP_INVALID_ATTR_INLINE_COMBINATION        =    1778
+        XML_SCHEMAP_MISSING_SIMPLETYPE_CHILD               =    1779
+        XML_SCHEMAP_INVALID_ATTR_NAME                      =    1780
+        XML_SCHEMAP_REF_AND_CONTENT                        =    1781
+        XML_SCHEMAP_CT_PROPS_CORRECT_1                     =    1782
+        XML_SCHEMAP_CT_PROPS_CORRECT_2                     =    1783
+        XML_SCHEMAP_CT_PROPS_CORRECT_3                     =    1784
+        XML_SCHEMAP_CT_PROPS_CORRECT_4                     =    1785
+        XML_SCHEMAP_CT_PROPS_CORRECT_5                     =    1786
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_1            =    1787
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_1        =    1788
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_2        =    1789
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_2          =    1790
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_3            =    1791
+        XML_SCHEMAP_WILDCARD_INVALID_NS_MEMBER             =    1792
+        XML_SCHEMAP_INTERSECTION_NOT_EXPRESSIBLE           =    1793
+        XML_SCHEMAP_UNION_NOT_EXPRESSIBLE                  =    1794
+        XML_SCHEMAP_SRC_IMPORT_3_1                         =    1795
+        XML_SCHEMAP_SRC_IMPORT_3_2                         =    1796
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_1          =    1797
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_2          =    1798
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_3          =    1799
+        XML_SCHEMAP_COS_CT_EXTENDS_1_3                     =    1800
+        XML_SCHEMAV_NOROOT                                 =    1801
+        XML_SCHEMAV_UNDECLAREDELEM                         =    1802
+        XML_SCHEMAV_NOTTOPLEVEL                            =    1803
+        XML_SCHEMAV_MISSING                                =    1804
+        XML_SCHEMAV_WRONGELEM                              =    1805
+        XML_SCHEMAV_NOTYPE                                 =    1806
+        XML_SCHEMAV_NOROLLBACK                             =    1807
+        XML_SCHEMAV_ISABSTRACT                             =    1808
+        XML_SCHEMAV_NOTEMPTY                               =    1809
+        XML_SCHEMAV_ELEMCONT                               =    1810
+        XML_SCHEMAV_HAVEDEFAULT                            =    1811
+        XML_SCHEMAV_NOTNILLABLE                            =    1812
+        XML_SCHEMAV_EXTRACONTENT                           =    1813
+        XML_SCHEMAV_INVALIDATTR                            =    1814
+        XML_SCHEMAV_INVALIDELEM                            =    1815
+        XML_SCHEMAV_NOTDETERMINIST                         =    1816
+        XML_SCHEMAV_CONSTRUCT                              =    1817
+        XML_SCHEMAV_INTERNAL                               =    1818
+        XML_SCHEMAV_NOTSIMPLE                              =    1819
+        XML_SCHEMAV_ATTRUNKNOWN                            =    1820
+        XML_SCHEMAV_ATTRINVALID                            =    1821
+        XML_SCHEMAV_VALUE                                  =    1822
+        XML_SCHEMAV_FACET                                  =    1823
+        XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_1               =    1824
+        XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_2               =    1825
+        XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_3               =    1826
+        XML_SCHEMAV_CVC_TYPE_3_1_1                         =    1827
+        XML_SCHEMAV_CVC_TYPE_3_1_2                         =    1828
+        XML_SCHEMAV_CVC_FACET_VALID                        =    1829
+        XML_SCHEMAV_CVC_LENGTH_VALID                       =    1830
+        XML_SCHEMAV_CVC_MINLENGTH_VALID                    =    1831
+        XML_SCHEMAV_CVC_MAXLENGTH_VALID                    =    1832
+        XML_SCHEMAV_CVC_MININCLUSIVE_VALID                 =    1833
+        XML_SCHEMAV_CVC_MAXINCLUSIVE_VALID                 =    1834
+        XML_SCHEMAV_CVC_MINEXCLUSIVE_VALID                 =    1835
+        XML_SCHEMAV_CVC_MAXEXCLUSIVE_VALID                 =    1836
+        XML_SCHEMAV_CVC_TOTALDIGITS_VALID                  =    1837
+        XML_SCHEMAV_CVC_FRACTIONDIGITS_VALID               =    1838
+        XML_SCHEMAV_CVC_PATTERN_VALID                      =    1839
+        XML_SCHEMAV_CVC_ENUMERATION_VALID                  =    1840
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_2_1                   =    1841
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_2_2                   =    1842
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_2_3                   =    1843
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_2_4                   =    1844
+        XML_SCHEMAV_CVC_ELT_1                              =    1845
+        XML_SCHEMAV_CVC_ELT_2                              =    1846
+        XML_SCHEMAV_CVC_ELT_3_1                            =    1847
+        XML_SCHEMAV_CVC_ELT_3_2_1                          =    1848
+        XML_SCHEMAV_CVC_ELT_3_2_2                          =    1849
+        XML_SCHEMAV_CVC_ELT_4_1                            =    1850
+        XML_SCHEMAV_CVC_ELT_4_2                            =    1851
+        XML_SCHEMAV_CVC_ELT_4_3                            =    1852
+        XML_SCHEMAV_CVC_ELT_5_1_1                          =    1853
+        XML_SCHEMAV_CVC_ELT_5_1_2                          =    1854
+        XML_SCHEMAV_CVC_ELT_5_2_1                          =    1855
+        XML_SCHEMAV_CVC_ELT_5_2_2_1                        =    1856
+        XML_SCHEMAV_CVC_ELT_5_2_2_2_1                      =    1857
+        XML_SCHEMAV_CVC_ELT_5_2_2_2_2                      =    1858
+        XML_SCHEMAV_CVC_ELT_6                              =    1859
+        XML_SCHEMAV_CVC_ELT_7                              =    1860
+        XML_SCHEMAV_CVC_ATTRIBUTE_1                        =    1861
+        XML_SCHEMAV_CVC_ATTRIBUTE_2                        =    1862
+        XML_SCHEMAV_CVC_ATTRIBUTE_3                        =    1863
+        XML_SCHEMAV_CVC_ATTRIBUTE_4                        =    1864
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_3_1                   =    1865
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_1                 =    1866
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_2                 =    1867
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_4                     =    1868
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_5_1                   =    1869
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_5_2                   =    1870
+        XML_SCHEMAV_ELEMENT_CONTENT                        =    1871
+        XML_SCHEMAV_DOCUMENT_ELEMENT_MISSING               =    1872
+        XML_SCHEMAV_CVC_COMPLEX_TYPE_1                     =    1873
+        XML_SCHEMAV_CVC_AU                                 =    1874
+        XML_SCHEMAV_CVC_TYPE_1                             =    1875
+        XML_SCHEMAV_CVC_TYPE_2                             =    1876
+        XML_SCHEMAV_CVC_IDC                                =    1877
+        XML_SCHEMAV_CVC_WILDCARD                           =    1878
+        XML_SCHEMAV_MISC                                   =    1879
+        XML_XPTR_UNKNOWN_SCHEME                            =    1900
+        XML_XPTR_CHILDSEQ_START                            =    1901
+        XML_XPTR_EVAL_FAILED                               =    1902
+        XML_XPTR_EXTRA_OBJECTS                             =    1903
+        XML_C14N_CREATE_CTXT                               =    1950
+        XML_C14N_REQUIRES_UTF8                             =    1951
+        XML_C14N_CREATE_STACK                              =    1952
+        XML_C14N_INVALID_NODE                              =    1953
+        XML_C14N_UNKNOW_NODE                               =    1954
+        XML_C14N_RELATIVE_NAMESPACE                        =    1955
+        XML_FTP_PASV_ANSWER                                =    2000
+        XML_FTP_EPSV_ANSWER                                =    2001
+        XML_FTP_ACCNT                                      =    2002
+        XML_FTP_URL_SYNTAX                                 =    2003
+        XML_HTTP_URL_SYNTAX                                =    2020
+        XML_HTTP_USE_IP                                    =    2021
+        XML_HTTP_UNKNOWN_HOST                              =    2022
+        XML_SCHEMAP_SRC_SIMPLE_TYPE_1                      =    3000
+        XML_SCHEMAP_SRC_SIMPLE_TYPE_2                      =    3001
+        XML_SCHEMAP_SRC_SIMPLE_TYPE_3                      =    3002
+        XML_SCHEMAP_SRC_SIMPLE_TYPE_4                      =    3003
+        XML_SCHEMAP_SRC_RESOLVE                            =    3004
+        XML_SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE     =    3005
+        XML_SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE        =    3006
+        XML_SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES   =    3007
+        XML_SCHEMAP_ST_PROPS_CORRECT_1                     =    3008
+        XML_SCHEMAP_ST_PROPS_CORRECT_2                     =    3009
+        XML_SCHEMAP_ST_PROPS_CORRECT_3                     =    3010
+        XML_SCHEMAP_COS_ST_RESTRICTS_1_1                   =    3011
+        XML_SCHEMAP_COS_ST_RESTRICTS_1_2                   =    3012
+        XML_SCHEMAP_COS_ST_RESTRICTS_1_3_1                 =    3013
+        XML_SCHEMAP_COS_ST_RESTRICTS_1_3_2                 =    3014
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_1                   =    3015
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_1               =    3016
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_2               =    3017
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_1               =    3018
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_2               =    3019
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_3               =    3020
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_4               =    3021
+        XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_5               =    3022
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_1                   =    3023
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1                 =    3024
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1_2               =    3025
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_2               =    3026
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_1               =    3027
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_3               =    3028
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_4               =    3029
+        XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_5               =    3030
+        XML_SCHEMAP_COS_ST_DERIVED_OK_2_1                  =    3031
+        XML_SCHEMAP_COS_ST_DERIVED_OK_2_2                  =    3032
+        XML_SCHEMAP_S4S_ELEM_NOT_ALLOWED                   =    3033
+        XML_SCHEMAP_S4S_ELEM_MISSING                       =    3034
+        XML_SCHEMAP_S4S_ATTR_NOT_ALLOWED                   =    3035
+        XML_SCHEMAP_S4S_ATTR_MISSING                       =    3036
+        XML_SCHEMAP_S4S_ATTR_INVALID_VALUE                 =    3037
+        XML_SCHEMAP_SRC_ELEMENT_1                          =    3038
+        XML_SCHEMAP_SRC_ELEMENT_2_1                        =    3039
+        XML_SCHEMAP_SRC_ELEMENT_2_2                        =    3040
+        XML_SCHEMAP_SRC_ELEMENT_3                          =    3041
+        XML_SCHEMAP_P_PROPS_CORRECT_1                      =    3042
+        XML_SCHEMAP_P_PROPS_CORRECT_2_1                    =    3043
+        XML_SCHEMAP_P_PROPS_CORRECT_2_2                    =    3044
+        XML_SCHEMAP_E_PROPS_CORRECT_2                      =    3045
+        XML_SCHEMAP_E_PROPS_CORRECT_3                      =    3046
+        XML_SCHEMAP_E_PROPS_CORRECT_4                      =    3047
+        XML_SCHEMAP_E_PROPS_CORRECT_5                      =    3048
+        XML_SCHEMAP_E_PROPS_CORRECT_6                      =    3049
+        XML_SCHEMAP_SRC_INCLUDE                            =    3050
+        XML_SCHEMAP_SRC_ATTRIBUTE_1                        =    3051
+        XML_SCHEMAP_SRC_ATTRIBUTE_2                        =    3052
+        XML_SCHEMAP_SRC_ATTRIBUTE_3_1                      =    3053
+        XML_SCHEMAP_SRC_ATTRIBUTE_3_2                      =    3054
+        XML_SCHEMAP_SRC_ATTRIBUTE_4                        =    3055
+        XML_SCHEMAP_NO_XMLNS                               =    3056
+        XML_SCHEMAP_NO_XSI                                 =    3057
+        XML_SCHEMAP_COS_VALID_DEFAULT_1                    =    3058
+        XML_SCHEMAP_COS_VALID_DEFAULT_2_1                  =    3059
+        XML_SCHEMAP_COS_VALID_DEFAULT_2_2_1                =    3060
+        XML_SCHEMAP_COS_VALID_DEFAULT_2_2_2                =    3061
+        XML_SCHEMAP_CVC_SIMPLE_TYPE                        =    3062
+        XML_SCHEMAP_COS_CT_EXTENDS_1_1                     =    3063
+        XML_SCHEMAP_SRC_IMPORT_1_1                         =    3064
+        XML_SCHEMAP_SRC_IMPORT_1_2                         =    3065
+        XML_SCHEMAP_SRC_IMPORT_2                           =    3066
+        XML_SCHEMAP_SRC_IMPORT_2_1                         =    3067
+        XML_SCHEMAP_SRC_IMPORT_2_2                         =    3068
+        XML_SCHEMAP_INTERNAL                               =    3069
+        XML_SCHEMAP_NOT_DETERMINISTIC                      =    3070
+        XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_1                  =    3071
+        XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_2                  =    3072
+        XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_3                  =    3073
+        XML_SCHEMAP_MG_PROPS_CORRECT_1                     =    3074
+        XML_SCHEMAP_MG_PROPS_CORRECT_2                     =    3075
+        XML_SCHEMAP_SRC_CT_1                               =    3076
+        XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3        =    3077
+        XML_SCHEMAP_AU_PROPS_CORRECT_2                     =    3078
+        XML_SCHEMAP_A_PROPS_CORRECT_2                      =    3079
+        XML_SCHEMAP_C_PROPS_CORRECT                        =    3080
+        XML_SCHEMAP_SRC_REDEFINE                           =    3081
+        XML_SCHEMAP_SRC_IMPORT                             =    3082
+        XML_SCHEMAP_WARN_SKIP_SCHEMA                       =    3083
+        XML_SCHEMAP_WARN_UNLOCATED_SCHEMA                  =    3084
+        XML_SCHEMAP_WARN_ATTR_REDECL_PROH                  =    3085
+        XML_SCHEMAP_WARN_ATTR_POINTLESS_PROH               =    3086
+        XML_SCHEMAP_AG_PROPS_CORRECT                       =    3087
+        XML_SCHEMAP_COS_CT_EXTENDS_1_2                     =    3088
+        XML_SCHEMAP_AU_PROPS_CORRECT                       =    3089
+        XML_SCHEMAP_A_PROPS_CORRECT_3                      =    3090
+        XML_SCHEMAP_COS_ALL_LIMITED                        =    3091
+        XML_SCHEMATRONV_ASSERT                             =    4000
+        XML_SCHEMATRONV_REPORT                             =    4001
+        XML_MODULE_OPEN                                    =    4900
+        XML_MODULE_CLOSE                                   =    4901
+        XML_CHECK_FOUND_ELEMENT                            =    5000
+        XML_CHECK_FOUND_ATTRIBUTE                          =    5001
+        XML_CHECK_FOUND_TEXT                               =    5002
+        XML_CHECK_FOUND_CDATA                              =    5003
+        XML_CHECK_FOUND_ENTITYREF                          =    5004
+        XML_CHECK_FOUND_ENTITY                             =    5005
+        XML_CHECK_FOUND_PI                                 =    5006
+        XML_CHECK_FOUND_COMMENT                            =    5007
+        XML_CHECK_FOUND_DOCTYPE                            =    5008
+        XML_CHECK_FOUND_FRAGMENT                           =    5009
+        XML_CHECK_FOUND_NOTATION                           =    5010
+        XML_CHECK_UNKNOWN_NODE                             =    5011
+        XML_CHECK_ENTITY_TYPE                              =    5012
+        XML_CHECK_NO_PARENT                                =    5013
+        XML_CHECK_NO_DOC                                   =    5014
+        XML_CHECK_NO_NAME                                  =    5015
+        XML_CHECK_NO_ELEM                                  =    5016
+        XML_CHECK_WRONG_DOC                                =    5017
+        XML_CHECK_NO_PREV                                  =    5018
+        XML_CHECK_WRONG_PREV                               =    5019
+        XML_CHECK_NO_NEXT                                  =    5020
+        XML_CHECK_WRONG_NEXT                               =    5021
+        XML_CHECK_NOT_DTD                                  =    5022
+        XML_CHECK_NOT_ATTR                                 =    5023
+        XML_CHECK_NOT_ATTR_DECL                            =    5024
+        XML_CHECK_NOT_ELEM_DECL                            =    5025
+        XML_CHECK_NOT_ENTITY_DECL                          =    5026
+        XML_CHECK_NOT_NS_DECL                              =    5027
+        XML_CHECK_NO_HREF                                  =    5028
+        XML_CHECK_WRONG_PARENT                             =    5029
+        XML_CHECK_NS_SCOPE                                 =    5030
+        XML_CHECK_NS_ANCESTOR                              =    5031
+        XML_CHECK_NOT_UTF8                                 =    5032
+        XML_CHECK_NO_DICT                                  =    5033
+        XML_CHECK_NOT_NCNAME                               =    5034
+        XML_CHECK_OUTSIDE_DICT                             =    5035
+        XML_CHECK_WRONG_NAME                               =    5036
+        XML_CHECK_NAME_NOT_NULL                            =    5037
+        XML_I18N_NO_NAME                                   =    6000
+        XML_I18N_NO_HANDLER                                =    6001
+        XML_I18N_EXCESS_HANDLER                            =    6002
+        XML_I18N_CONV_FAILED                               =    6003
+        XML_I18N_NO_OUTPUT                                 =    6004
+        XML_BUF_OVERFLOW                                   =    7000
+
+    ctypedef enum xmlRelaxNGValidErr:
+        XML_RELAXNG_OK                                     =       0
+        XML_RELAXNG_ERR_MEMORY                             =       1
+        XML_RELAXNG_ERR_TYPE                               =       2
+        XML_RELAXNG_ERR_TYPEVAL                            =       3
+        XML_RELAXNG_ERR_DUPID                              =       4
+        XML_RELAXNG_ERR_TYPECMP                            =       5
+        XML_RELAXNG_ERR_NOSTATE                            =       6
+        XML_RELAXNG_ERR_NODEFINE                           =       7
+        XML_RELAXNG_ERR_LISTEXTRA                          =       8
+        XML_RELAXNG_ERR_LISTEMPTY                          =       9
+        XML_RELAXNG_ERR_INTERNODATA                        =      10
+        XML_RELAXNG_ERR_INTERSEQ                           =      11
+        XML_RELAXNG_ERR_INTEREXTRA                         =      12
+        XML_RELAXNG_ERR_ELEMNAME                           =      13
+        XML_RELAXNG_ERR_ATTRNAME                           =      14
+        XML_RELAXNG_ERR_ELEMNONS                           =      15
+        XML_RELAXNG_ERR_ATTRNONS                           =      16
+        XML_RELAXNG_ERR_ELEMWRONGNS                        =      17
+        XML_RELAXNG_ERR_ATTRWRONGNS                        =      18
+        XML_RELAXNG_ERR_ELEMEXTRANS                        =      19
+        XML_RELAXNG_ERR_ATTREXTRANS                        =      20
+        XML_RELAXNG_ERR_ELEMNOTEMPTY                       =      21
+        XML_RELAXNG_ERR_NOELEM                             =      22
+        XML_RELAXNG_ERR_NOTELEM                            =      23
+        XML_RELAXNG_ERR_ATTRVALID                          =      24
+        XML_RELAXNG_ERR_CONTENTVALID                       =      25
+        XML_RELAXNG_ERR_EXTRACONTENT                       =      26
+        XML_RELAXNG_ERR_INVALIDATTR                        =      27
+        XML_RELAXNG_ERR_DATAELEM                           =      28
+        XML_RELAXNG_ERR_VALELEM                            =      29
+        XML_RELAXNG_ERR_LISTELEM                           =      30
+        XML_RELAXNG_ERR_DATATYPE                           =      31
+        XML_RELAXNG_ERR_VALUE                              =      32
+        XML_RELAXNG_ERR_LIST                               =      33
+        XML_RELAXNG_ERR_NOGRAMMAR                          =      34
+        XML_RELAXNG_ERR_EXTRADATA                          =      35
+        XML_RELAXNG_ERR_LACKDATA                           =      36
+        XML_RELAXNG_ERR_INTERNAL                           =      37
+        XML_RELAXNG_ERR_ELEMWRONG                          =      38
+        XML_RELAXNG_ERR_TEXTWRONG                          =      39
+# --- END: GENERATED CONSTANTS ---
+
+cdef extern from "libxml/xmlerror.h" nogil:
+    ctypedef struct xmlError:
+        int domain
+        int code
+        char* message
+        xmlErrorLevel level
+        char* file
+        char* str1
+        char* str2
+        char* str3
+        int line
+        int int1
+        int int2
+        void* node
+
+    ctypedef void (*xmlGenericErrorFunc)(void* ctxt, char* msg, ...) noexcept
+    ctypedef void (*xmlStructuredErrorFunc)(void* userData,
+                                            const xmlError* error) noexcept
+
+    cdef void xmlSetGenericErrorFunc(
+        void* ctxt, xmlGenericErrorFunc func)
+    cdef void xmlSetStructuredErrorFunc(
+        void* ctxt, xmlStructuredErrorFunc func)
+
+cdef extern from "libxml/globals.h" nogil:
+    cdef xmlStructuredErrorFunc xmlStructuredError
+    cdef void* xmlStructuredErrorContext
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlparser.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlparser.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..e0ef221af7e14f7fa2bb4c7ed8710a5b0524c5d0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlparser.pxd
@@ -0,0 +1,318 @@
+from libc.string cimport const_char
+
+from lxml.includes.tree cimport (
+    xmlDoc, xmlNode, xmlEntity, xmlDict, xmlDtd, xmlChar, const_xmlChar)
+from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
+from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc, xmlErrorLevel
+
+
+cdef extern from "libxml/parser.h" nogil:
+    ctypedef void (*startElementNsSAX2Func)(void* ctx,
+                                            const_xmlChar* localname,
+                                            const_xmlChar* prefix,
+                                            const_xmlChar* URI,
+                                            int nb_namespaces,
+                                            const_xmlChar** namespaces,
+                                            int nb_attributes,
+                                            int nb_defaulted,
+                                            const_xmlChar** attributes) noexcept
+
+    ctypedef void (*endElementNsSAX2Func)(void* ctx,
+                                          const_xmlChar* localname,
+                                          const_xmlChar* prefix,
+                                          const_xmlChar* URI) noexcept
+
+    ctypedef void (*startElementSAXFunc)(void* ctx, const_xmlChar* name, const_xmlChar** atts) noexcept
+
+    ctypedef void (*endElementSAXFunc)(void* ctx, const_xmlChar* name) noexcept
+
+    ctypedef void (*charactersSAXFunc)(void* ctx, const_xmlChar* ch, int len) noexcept
+
+    ctypedef void (*cdataBlockSAXFunc)(void* ctx, const_xmlChar* value, int len) noexcept
+
+    ctypedef void (*commentSAXFunc)(void* ctx, const_xmlChar* value) noexcept
+
+    ctypedef void (*processingInstructionSAXFunc)(void* ctx,
+                                                  const_xmlChar* target,
+                                                  const_xmlChar* data) noexcept
+
+    ctypedef void (*internalSubsetSAXFunc)(void* ctx,
+                                            const_xmlChar* name,
+                                            const_xmlChar* externalID,
+                                            const_xmlChar* systemID) noexcept
+
+    ctypedef void (*endDocumentSAXFunc)(void* ctx) noexcept
+
+    ctypedef void (*startDocumentSAXFunc)(void* ctx) noexcept
+
+    ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name) noexcept
+
+    ctypedef xmlEntity* (*getEntitySAXFunc)(void* ctx, const_xmlChar* name) noexcept
+
+    cdef int XML_SAX2_MAGIC
+
+cdef extern from "libxml/tree.h" nogil:
+    ctypedef struct xmlParserInput:
+        int line
+        int col
+        int length
+        const_xmlChar* base
+        const_xmlChar* cur
+        const_xmlChar* end
+        const_char *filename
+
+    ctypedef struct xmlParserInputBuffer:
+        void* context
+        xmlInputReadCallback  readcallback
+        xmlInputCloseCallback closecallback
+
+    ctypedef struct xmlSAXHandlerV1:
+        # same as xmlSAXHandler, but without namespaces
+        pass
+
+    ctypedef struct xmlSAXHandler:
+        internalSubsetSAXFunc           internalSubset
+        startElementNsSAX2Func          startElementNs
+        endElementNsSAX2Func            endElementNs
+        startElementSAXFunc             startElement
+        endElementSAXFunc               endElement
+        charactersSAXFunc               characters
+        cdataBlockSAXFunc               cdataBlock
+        referenceSAXFunc                reference
+        getEntitySAXFunc                getEntity
+        commentSAXFunc                  comment
+        processingInstructionSAXFunc	processingInstruction
+        startDocumentSAXFunc            startDocument
+        endDocumentSAXFunc              endDocument
+        int                             initialized
+        xmlStructuredErrorFunc          serror
+        void*                           _private
+
+
+cdef extern from "libxml/SAX2.h" nogil:
+    cdef void xmlSAX2StartDocument(void* ctxt)
+
+
+cdef extern from "libxml/xmlIO.h" nogil:
+    cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)
+
+
+cdef extern from "libxml/parser.h" nogil:
+
+    ctypedef enum xmlFeature:
+        XML_WITH_THREAD = 1
+        XML_WITH_TREE = 2
+        XML_WITH_OUTPUT = 3
+        XML_WITH_PUSH = 4
+        XML_WITH_READER = 5
+        XML_WITH_PATTERN = 6
+        XML_WITH_WRITER = 7
+        XML_WITH_SAX1 = 8
+        XML_WITH_FTP = 9
+        XML_WITH_HTTP = 10
+        XML_WITH_VALID = 11
+        XML_WITH_HTML = 12
+        XML_WITH_LEGACY = 13
+        XML_WITH_C14N = 14
+        XML_WITH_CATALOG = 15
+        XML_WITH_XPATH = 16
+        XML_WITH_XPTR = 17
+        XML_WITH_XINCLUDE = 18
+        XML_WITH_ICONV = 19
+        XML_WITH_ISO8859X = 20
+        XML_WITH_UNICODE = 21
+        XML_WITH_REGEXP = 22
+        XML_WITH_AUTOMATA = 23
+        XML_WITH_EXPR = 24
+        XML_WITH_SCHEMAS = 25
+        XML_WITH_SCHEMATRON = 26
+        XML_WITH_MODULES = 27
+        XML_WITH_DEBUG = 28
+        XML_WITH_DEBUG_MEM = 29
+        XML_WITH_DEBUG_RUN = 30
+        XML_WITH_ZLIB = 31
+        XML_WITH_ICU = 32
+        XML_WITH_LZMA = 33
+
+    cdef bint xmlHasFeature(xmlFeature feature)
+
+    cdef xmlDict* xmlDictCreate()
+    cdef xmlDict* xmlDictCreateSub(xmlDict* subdict)
+    cdef void xmlDictFree(xmlDict* sub)
+    cdef int xmlDictReference(xmlDict* dict)
+
+    cdef int XML_COMPLETE_ATTRS  # SAX option for adding DTD default attributes
+    cdef int XML_SKIP_IDS        # SAX option for not building an XML ID dict
+
+    ctypedef enum xmlParserInputState:
+        XML_PARSER_EOF = -1  # nothing is to be parsed
+        XML_PARSER_START = 0  # nothing has been parsed
+        XML_PARSER_MISC = 1  # Misc* before int subset
+        XML_PARSER_PI = 2  # Within a processing instruction
+        XML_PARSER_DTD = 3  # within some DTD content
+        XML_PARSER_PROLOG = 4  # Misc* after internal subset
+        XML_PARSER_COMMENT = 5  # within a comment
+        XML_PARSER_START_TAG = 6  # within a start tag
+        XML_PARSER_CONTENT = 7  # within the content
+        XML_PARSER_CDATA_SECTION = 8  # within a CDATA section
+        XML_PARSER_END_TAG = 9  # within a closing tag
+        XML_PARSER_ENTITY_DECL = 10  # within an entity declaration
+        XML_PARSER_ENTITY_VALUE = 11  # within an entity value in a decl
+        XML_PARSER_ATTRIBUTE_VALUE = 12  # within an attribute value
+        XML_PARSER_SYSTEM_LITERAL = 13  # within a SYSTEM value
+        XML_PARSER_EPILOG = 14  # the Misc* after the last end tag
+        XML_PARSER_IGNORE = 15  # within an IGNORED section
+        XML_PARSER_PUBLIC_LITERAL = 16  # within a PUBLIC value
+
+
+    ctypedef struct xmlParserCtxt:
+        xmlDoc* myDoc
+        xmlDict* dict
+        int dictNames
+        void* _private
+        bint wellFormed
+        bint recovery
+        int options
+        bint disableSAX
+        int errNo
+        xmlParserInputState instate
+        bint replaceEntities
+        int loadsubset  # != 0 if enabled, int value == why
+        bint validate
+        xmlError lastError
+        xmlNode* node
+        xmlSAXHandler* sax
+        void* userData
+        int* spaceTab
+        int spaceMax
+        int nsNr
+        bint html
+        bint progressive
+        int inSubset
+        int charset
+        xmlParserInput* input
+        int inputNr
+        xmlParserInput* inputTab[]
+
+    ctypedef enum xmlParserOption:
+        XML_PARSE_RECOVER = 0x1                   # recover on errors
+        XML_PARSE_NOENT = 0x2                     # substitute entities
+        XML_PARSE_DTDLOAD = 0x4                   # load the external subset
+        XML_PARSE_DTDATTR = 0x8                   # default DTD attributes
+        XML_PARSE_DTDVALID = 0x10                 # validate with the DTD
+        XML_PARSE_NOERROR = 0x20                  # suppress error reports
+        XML_PARSE_NOWARNING = 0x40                # suppress warning reports
+        XML_PARSE_PEDANTIC = 0x80                 # pedantic error reporting
+        XML_PARSE_NOBLANKS = 0x100                # remove blank nodes
+        XML_PARSE_SAX1 = 0x200                    # use the SAX1 interface internally
+        XML_PARSE_XINCLUDE = 0x400                # Implement XInclude substitution
+        XML_PARSE_NONET = 0x800                   # Forbid network access
+        XML_PARSE_NODICT = 0x1000                 # Do not reuse the context dictionary
+        XML_PARSE_NSCLEAN = 0x2000                # remove redundant namespaces declarations
+        XML_PARSE_NOCDATA = 0x4000                # merge CDATA as text nodes
+        XML_PARSE_NOXINCNODE = 0x8000             # do not generate XINCLUDE START/END nodes
+        # libxml2 2.6.21+ only:
+        XML_PARSE_COMPACT = 0x1_0000              # compact small text nodes
+        # libxml2 2.7.0+ only:
+        XML_PARSE_OLD10 = 0x2_0000                # parse using XML-1.0 before update 5
+        XML_PARSE_NOBASEFIX = 0x4_0000            # do not fixup XINCLUDE xml:base uris
+        XML_PARSE_HUGE = 0x8_0000                 # relax any hardcoded limit from the parser
+        # libxml2 2.7.3+ only:
+        XML_PARSE_OLDSAX = 0x10_0000              # parse using SAX2 interface before 2.7.0
+        # libxml2 2.8.0+ only:
+        XML_PARSE_IGNORE_ENC = 0x20_0000          # ignore internal document encoding hint
+        # libxml2 2.9.0+ only:
+        XML_PARSE_BIG_LINES = 0x40_0000           # Store big lines numbers in text PSVI field
+        # libxml2 2.13.0+ only:
+        XML_PARSE_NO_XXE = 0x80_0000              # Disable loading of external DTDs or entities
+        # libxml2 2.14.0+ only:
+        XML_PARSE_UNZIP = 0x100_0000              # Enable input decompression (and potential gzip bombs)
+        XML_PARSE_NO_SYS_CATALOG = 0x200_0000     # Disable the global system XML catalog
+        XML_PARSE_CATALOG_PI = 0x400_0000         # Enable XML catalog processing instructions
+        # libxml2 2.15.0+ only:
+        XML_PARSE_SKIP_IDS = 0x800_0000           # Force the parser to ignore IDs
+
+    cdef void xmlInitParser()
+    cdef void xmlCleanupParser()
+
+    cdef int xmlLineNumbersDefault(int onoff)
+    cdef xmlParserCtxt* xmlNewParserCtxt()
+    cdef xmlParserInput* xmlNewIOInputStream(xmlParserCtxt* ctxt,
+                                             xmlParserInputBuffer* input,
+                                             int enc)
+    cdef int xmlCtxtUseOptions(xmlParserCtxt* ctxt, int options)
+    cdef void xmlFreeParserCtxt(xmlParserCtxt* ctxt)
+    cdef void xmlCtxtReset(xmlParserCtxt* ctxt)
+    cdef void xmlClearParserCtxt(xmlParserCtxt* ctxt)
+    cdef int xmlParseChunk(xmlParserCtxt* ctxt,
+                           char* chunk, int size, int terminate)
+    cdef xmlDoc* xmlCtxtReadDoc(xmlParserCtxt* ctxt,
+                                char* cur, char* URL, char* encoding,
+                                int options)
+    cdef xmlDoc* xmlCtxtReadFile(xmlParserCtxt* ctxt,
+                                 char* filename, char* encoding,
+                                 int options)
+    cdef xmlDoc* xmlCtxtReadIO(xmlParserCtxt* ctxt,
+                               xmlInputReadCallback ioread,
+                               xmlInputCloseCallback ioclose,
+                               void* ioctx,
+                               char* URL, char* encoding,
+                               int options)
+    cdef xmlDoc* xmlCtxtReadMemory(xmlParserCtxt* ctxt,
+                                   char* buffer, int size,
+                                   char* filename, const_char* encoding,
+                                   int options)
+
+    cdef void xmlErrParser(xmlParserCtxt* ctxt, xmlNode* node,
+                           int domain, int code, xmlErrorLevel level,
+                           const xmlChar *str1, const xmlChar *str2, const xmlChar *str3,
+                           int int1, const char *msg, ...)
+
+
+# iterparse:
+
+    cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax,
+                                                void* user_data,
+                                                char* chunk,
+                                                int size,
+                                                char* filename)
+
+    cdef int xmlCtxtResetPush(xmlParserCtxt* ctxt,
+                              char* chunk,
+                              int size,
+                              char* filename,
+                              char* encoding)
+
+# entity loaders:
+
+    ctypedef xmlParserInput* (*xmlExternalEntityLoader)(
+        const_char * URL, const_char * ID, xmlParserCtxt* context) noexcept
+    cdef xmlExternalEntityLoader xmlGetExternalEntityLoader()
+    cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f)
+
+    cdef xmlEntity* xmlSAX2GetEntity(void* ctxt, const_xmlChar* name) noexcept
+
+# DTDs:
+
+    cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID)
+    cdef xmlDtd* xmlIOParseDTD(xmlSAXHandler* sax,
+                               xmlParserInputBuffer* input,
+                               int enc)
+
+
+cdef extern from "libxml/parserInternals.h" nogil:
+    """
+    #if LIBXML_VERSION < 21400
+    #define xmlNewInputFromMemory(url, mem, size, flags)  (NULL)
+    #endif
+    """
+    cdef xmlParserInput* xmlNewInputStream(xmlParserCtxt* ctxt)
+    cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt,
+                                                 char* buffer)
+    cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt,
+                                             char* filename)
+    cdef xmlParserInput* xmlNewInputFromMemory(
+        const char *url, const void *mem, size_t size, int flags)  # actually "xmlParserInputFlags flags"
+    cdef void xmlFreeInputStream(xmlParserInput* input)
+    cdef int xmlSwitchEncoding(xmlParserCtxt* ctxt, int enc)
+    cdef bint xmlCtxtIsStopped(xmlParserCtxt* ctxt)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlschema.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlschema.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..0674111132ee3c2da8f2a3ec1db7d9dbb462ad30
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xmlschema.pxd
@@ -0,0 +1,35 @@
+from lxml.includes.tree cimport xmlDoc
+from lxml.includes.xmlparser cimport xmlSAXHandler
+from lxml.includes.xmlerror cimport xmlStructuredErrorFunc
+
+cdef extern from "libxml/xmlschemas.h" nogil:
+    ctypedef struct xmlSchema
+    ctypedef struct xmlSchemaParserCtxt
+
+    ctypedef struct xmlSchemaSAXPlugStruct
+    ctypedef struct xmlSchemaValidCtxt
+
+    ctypedef enum xmlSchemaValidOption:
+        XML_SCHEMA_VAL_VC_I_CREATE = 1
+
+    cdef xmlSchemaValidCtxt* xmlSchemaNewValidCtxt(xmlSchema* schema) nogil
+    cdef void xmlSchemaSetParserStructuredErrors(xmlSchemaParserCtxt* ctxt,
+        xmlStructuredErrorFunc serror, void *ctx)
+    cdef void xmlSchemaSetValidStructuredErrors(xmlSchemaValidCtxt* ctxt,
+        xmlStructuredErrorFunc serror, void *ctx)
+
+    cdef int xmlSchemaValidateDoc(xmlSchemaValidCtxt* ctxt, xmlDoc* doc) nogil
+    cdef xmlSchema* xmlSchemaParse(xmlSchemaParserCtxt* ctxt) nogil
+    cdef xmlSchemaParserCtxt* xmlSchemaNewParserCtxt(char* URL) nogil
+    cdef xmlSchemaParserCtxt* xmlSchemaNewDocParserCtxt(xmlDoc* doc) nogil
+    cdef void xmlSchemaFree(xmlSchema* schema) nogil
+    cdef void xmlSchemaFreeParserCtxt(xmlSchemaParserCtxt* ctxt) nogil
+    cdef void xmlSchemaFreeValidCtxt(xmlSchemaValidCtxt* ctxt) nogil
+    cdef int xmlSchemaSetValidOptions(xmlSchemaValidCtxt* ctxt,
+                                      int options) nogil
+
+    cdef xmlSchemaSAXPlugStruct* xmlSchemaSAXPlug(xmlSchemaValidCtxt* ctxt,
+                                                  xmlSAXHandler** sax,
+                                                  void** data) nogil
+    cdef int xmlSchemaSAXUnplug(xmlSchemaSAXPlugStruct* sax_plug)
+    cdef int xmlSchemaIsValid(xmlSchemaValidCtxt* ctxt)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xslt.pxd b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xslt.pxd
new file mode 100644
index 0000000000000000000000000000000000000000..abafe4325c9147e77590ff0f1647100bb5e9c56a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xslt.pxd
@@ -0,0 +1,190 @@
+from lxml.includes.tree cimport xmlDoc, xmlNode, xmlDict, xmlChar, const_xmlChar, xmlOutputBuffer
+from lxml.includes.xmlerror cimport xmlGenericErrorFunc
+from lxml.includes.xpath cimport xmlXPathContext, xmlXPathFunction
+
+from libc.string cimport const_char
+
+cdef extern from "libxslt/xslt.h":
+    cdef int xsltLibxsltVersion
+    cdef int xsltMaxDepth
+
+cdef extern from "libxslt/xsltconfig.h":
+    cdef int LIBXSLT_VERSION
+
+cdef extern from "libxslt/xsltInternals.h" nogil:
+    ctypedef enum xsltTransformState:
+        XSLT_STATE_OK       # 0
+        XSLT_STATE_ERROR    # 1
+        XSLT_STATE_STOPPED  # 2
+
+    ctypedef struct xsltDocument:
+        xmlDoc* doc
+
+    ctypedef struct xsltStylesheet:
+        xmlChar* encoding
+        xmlDoc* doc
+        int errors
+
+    ctypedef struct xsltTransformContext:
+        xsltStylesheet* style
+        xmlXPathContext* xpathCtxt
+        xsltDocument* document
+        void* _private
+        xmlDict* dict
+        int profile
+        xmlNode* node
+        xmlDoc* output
+        xmlNode* insert
+        xmlNode* inst
+        xsltTransformState state
+
+    ctypedef struct xsltStackElem
+
+    ctypedef struct xsltTemplate
+
+    cdef xsltStylesheet* xsltParseStylesheetDoc(xmlDoc* doc)
+    cdef void xsltFreeStylesheet(xsltStylesheet* sheet)
+
+cdef extern from "libxslt/imports.h" nogil:
+    # actually defined in "etree_defs.h"
+    cdef void LXML_GET_XSLT_ENCODING(const_xmlChar* result_var, xsltStylesheet* style)
+
+cdef extern from "libxslt/extensions.h" nogil:
+    ctypedef void (*xsltTransformFunction)(xsltTransformContext* ctxt,
+                                           xmlNode* context_node,
+                                           xmlNode* inst,
+                                           void* precomp_unused) noexcept
+
+    cdef int xsltRegisterExtFunction(xsltTransformContext* ctxt,
+                                     const_xmlChar* name,
+                                     const_xmlChar* URI,
+                                     xmlXPathFunction function)
+    cdef int xsltRegisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI,
+                                           xmlXPathFunction function)
+    cdef int xsltUnregisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI)
+    cdef xmlXPathFunction xsltExtModuleFunctionLookup(
+        const_xmlChar* name, const_xmlChar* URI)
+    cdef int xsltRegisterExtPrefix(xsltStylesheet* style, 
+                                   const_xmlChar* prefix, const_xmlChar* URI)
+    cdef int xsltRegisterExtElement(xsltTransformContext* ctxt,
+                                    const_xmlChar* name, const_xmlChar* URI,
+                                    xsltTransformFunction function)
+
+cdef extern from "libxslt/documents.h" nogil:
+    ctypedef enum xsltLoadType:
+        XSLT_LOAD_START
+        XSLT_LOAD_STYLESHEET
+        XSLT_LOAD_DOCUMENT
+
+    ctypedef xmlDoc* (*xsltDocLoaderFunc)(const_xmlChar* URI, xmlDict* dict,
+                                          int options,
+                                          void* ctxt,
+                                          xsltLoadType type) noexcept
+    cdef xsltDocLoaderFunc xsltDocDefaultLoader
+    cdef void xsltSetLoaderFunc(xsltDocLoaderFunc f)
+
+cdef extern from "libxslt/transform.h" nogil:
+    cdef xmlDoc* xsltApplyStylesheet(xsltStylesheet* style, xmlDoc* doc,
+                                     const_char** params)
+    cdef xmlDoc* xsltApplyStylesheetUser(xsltStylesheet* style, xmlDoc* doc,
+                                         const_char** params, const_char* output,
+                                         void* profile,
+                                         xsltTransformContext* context)
+    cdef void xsltProcessOneNode(xsltTransformContext* ctxt,
+                                 xmlNode* contextNode,
+                                 xsltStackElem* params)
+    cdef xsltTransformContext* xsltNewTransformContext(xsltStylesheet* style,
+                                                       xmlDoc* doc)
+    cdef void xsltFreeTransformContext(xsltTransformContext* context)
+    cdef void xsltApplyOneTemplate(xsltTransformContext* ctxt,
+                                   xmlNode* contextNode, xmlNode* list,
+                                   xsltTemplate* templ,
+                                   xsltStackElem* params)
+
+
+cdef extern from "libxslt/xsltutils.h" nogil:
+    cdef int xsltSaveResultToString(xmlChar** doc_txt_ptr,
+                                    int* doc_txt_len,
+                                    xmlDoc* result,
+                                    xsltStylesheet* style)
+    cdef int xsltSaveResultToFilename(const_char *URL,
+                                      xmlDoc* result,
+                                      xsltStylesheet* style,
+                                      int compression)
+    cdef int xsltSaveResultTo(xmlOutputBuffer* buf,
+                              xmlDoc* result,
+                              xsltStylesheet* style)
+    cdef xmlGenericErrorFunc xsltGenericError
+    cdef void *xsltGenericErrorContext
+    cdef void xsltSetGenericErrorFunc(
+        void* ctxt, void (*handler)(void* ctxt, char* msg, ...) nogil)
+    cdef void xsltSetTransformErrorFunc(
+        xsltTransformContext*, void* ctxt,
+        void (*handler)(void* ctxt, char* msg, ...) nogil)
+    cdef void xsltTransformError(xsltTransformContext* ctxt, 
+                                 xsltStylesheet* style, 
+                                 xmlNode* node, char* msg, ...)
+    cdef void xsltSetCtxtParseOptions(
+        xsltTransformContext* ctxt, int options)
+
+
+cdef extern from "libxslt/security.h" nogil:
+    ctypedef struct xsltSecurityPrefs
+    ctypedef enum xsltSecurityOption:
+        XSLT_SECPREF_READ_FILE = 1
+        XSLT_SECPREF_WRITE_FILE = 2
+        XSLT_SECPREF_CREATE_DIRECTORY = 3
+        XSLT_SECPREF_READ_NETWORK = 4
+        XSLT_SECPREF_WRITE_NETWORK = 5
+
+    ctypedef int (*xsltSecurityCheck)(xsltSecurityPrefs* sec,
+                                      xsltTransformContext* ctxt,
+                                      char* value) noexcept
+
+    cdef xsltSecurityPrefs* xsltNewSecurityPrefs()
+    cdef void xsltFreeSecurityPrefs(xsltSecurityPrefs* sec)
+    cdef int xsltSecurityForbid(xsltSecurityPrefs* sec,
+                                xsltTransformContext* ctxt,
+                                char* value)
+    cdef int xsltSecurityAllow(xsltSecurityPrefs* sec,
+                                xsltTransformContext* ctxt,
+                                char* value)
+    cdef int xsltSetSecurityPrefs(xsltSecurityPrefs* sec,
+                                  xsltSecurityOption option,
+                                  xsltSecurityCheck func)
+    cdef xsltSecurityCheck xsltGetSecurityPrefs(
+        xsltSecurityPrefs* sec,
+        xsltSecurityOption option)
+    cdef int xsltSetCtxtSecurityPrefs(xsltSecurityPrefs* sec,
+                                      xsltTransformContext* ctxt)
+    cdef xmlDoc* xsltGetProfileInformation(xsltTransformContext* ctxt)
+
+cdef extern from "libxslt/variables.h" nogil:
+    cdef int xsltQuoteUserParams(xsltTransformContext* ctxt,
+                                 const_char** params)
+    cdef int xsltQuoteOneUserParam(xsltTransformContext* ctxt,
+                                   const_xmlChar* name,
+                                   const_xmlChar* value)
+
+cdef extern from "libxslt/extra.h" nogil:
+    const_xmlChar* XSLT_LIBXSLT_NAMESPACE
+    const_xmlChar* XSLT_XALAN_NAMESPACE
+    const_xmlChar* XSLT_SAXON_NAMESPACE
+    const_xmlChar* XSLT_XT_NAMESPACE
+
+    cdef xmlXPathFunction xsltFunctionNodeSet
+    cdef void xsltRegisterAllExtras()
+
+cdef extern from "libexslt/exslt.h" nogil:
+    cdef void exsltRegisterAll()
+
+    # libexslt 1.1.25+
+    const_xmlChar* EXSLT_DATE_NAMESPACE
+    const_xmlChar* EXSLT_SETS_NAMESPACE
+    const_xmlChar* EXSLT_MATH_NAMESPACE
+    const_xmlChar* EXSLT_STRINGS_NAMESPACE
+
+    cdef int exsltDateXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
+    cdef int exsltSetsXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
+    cdef int exsltMathXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
+    cdef int exsltStrXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc9e2c299647192bf5ee650105ff46426b3e4076
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/rng/iso-schematron.rng b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/rng/iso-schematron.rng
new file mode 100644
index 0000000000000000000000000000000000000000..a4f504af1f7d6f01f7523d447b9304f417c01800
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/rng/iso-schematron.rng
@@ -0,0 +1,709 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Copyright © ISO/IEC 2015 -->
+<!--
+  The following permission notice and disclaimer shall be included in all
+  copies of this XML schema ("the Schema"), and derivations of the Schema:
+  
+  Permission is hereby granted, free of charge in perpetuity, to any
+  person obtaining a copy of the Schema, to use, copy, modify, merge and
+  distribute free of charge, copies of the Schema for the purposes of
+  developing, implementing, installing and using software based on the
+  Schema, and to permit persons to whom the Schema is furnished to do so,
+  subject to the following conditions:
+  
+  THE SCHEMA IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SCHEMA OR THE USE OR
+  OTHER DEALINGS IN THE SCHEMA.
+  
+  In addition, any modified copy of the Schema shall include the following
+  notice:
+  
+  "THIS SCHEMA HAS BEEN MODIFIED FROM THE SCHEMA DEFINED IN ISO/IEC 19757-3,
+  AND SHOULD NOT BE INTERPRETED AS COMPLYING WITH THAT STANDARD".
+-->
+<grammar ns="http://purl.oclc.org/dsdl/schematron" xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
+  <start>
+    <ref name="schema"/>
+  </start>
+  <!-- Element declarations -->
+  <define name="schema">
+    <element name="schema">
+      <optional>
+        <attribute name="id">
+          <data type="ID"/>
+        </attribute>
+      </optional>
+      <ref name="rich"/>
+      <optional>
+        <attribute name="schemaVersion">
+          <ref name="non-empty-string"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="defaultPhase">
+          <data type="IDREF"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="queryBinding">
+          <ref name="non-empty-string"/>
+        </attribute>
+      </optional>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <ref name="inclusion"/>
+        </zeroOrMore>
+        <group>
+          <optional>
+            <ref name="title"/>
+          </optional>
+          <zeroOrMore>
+            <ref name="ns"/>
+          </zeroOrMore>
+          <zeroOrMore>
+            <ref name="p"/>
+          </zeroOrMore>
+          <zeroOrMore>
+            <ref name="let"/>
+          </zeroOrMore>
+          <zeroOrMore>
+            <ref name="phase"/>
+          </zeroOrMore>
+          <oneOrMore>
+            <ref name="pattern"/>
+          </oneOrMore>
+          <zeroOrMore>
+            <ref name="p"/>
+          </zeroOrMore>
+          <optional>
+            <ref name="diagnostics"/>
+          </optional>
+          <optional>
+            <!-- edited (lxml): required in standard, optional here (since it can be empty anyway) -->
+            <ref name="properties"/>
+          </optional>
+        </group>
+      </interleave>
+    </element>
+  </define>
+  <define name="active">
+    <element name="active">
+      <attribute name="pattern">
+        <data type="IDREF"/>
+      </attribute>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <choice>
+            <text/>
+            <ref name="dir"/>
+            <ref name="emph"/>
+            <ref name="span"/>
+          </choice>
+        </zeroOrMore>
+      </interleave>
+    </element>
+  </define>
+  <define name="assert">
+    <element name="assert">
+      <attribute name="test">
+        <ref name="exprValue"/>
+      </attribute>
+      <optional>
+        <attribute name="flag">
+          <ref name="flagValue"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="id">
+          <data type="ID"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="diagnostics">
+          <data type="IDREFS"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="properties">
+          <data type="IDREFS"/>
+        </attribute>
+      </optional>
+      <ref name="rich"/>
+      <ref name="linkable"/>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <choice>
+            <text/>
+            <ref name="name"/>
+            <ref name="value-of"/>
+            <ref name="emph"/>
+            <ref name="dir"/>
+            <ref name="span"/>
+          </choice>
+        </zeroOrMore>
+      </interleave>
+    </element>
+  </define>
+  <define name="diagnostic">
+    <element name="diagnostic">
+      <attribute name="id">
+        <data type="ID"/>
+      </attribute>
+      <ref name="rich"/>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <choice>
+            <text/>
+            <ref name="value-of"/>
+            <ref name="emph"/>
+            <ref name="dir"/>
+            <ref name="span"/>
+          </choice>
+        </zeroOrMore>
+      </interleave>
+    </element>
+  </define>
+  <define name="diagnostics">
+    <element name="diagnostics">
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <ref name="inclusion"/>
+        </zeroOrMore>
+        <zeroOrMore>
+          <ref name="diagnostic"/>
+        </zeroOrMore>
+      </interleave>
+    </element>
+  </define>
+  <define name="dir">
+    <element name="dir">
+      <optional>
+        <attribute name="value">
+          <choice>
+            <value>ltr</value>
+            <value>rtl</value>
+          </choice>
+        </attribute>
+      </optional>
+      <interleave>
+        <ref name="foreign"/>
+        <text/>
+      </interleave>
+    </element>
+  </define>
+  <define name="emph">
+    <element name="emph">
+      <text/>
+    </element>
+  </define>
+  <define name="extends">
+    <element name="extends">
+      <choice>
+        <attribute name="rule">
+          <data type="IDREF"/>
+        </attribute>
+        <attribute name="href">
+          <ref name="uriValue"/>
+        </attribute>
+      </choice>
+      <ref name="foreign-empty"/>
+    </element>
+  </define>
+  <define name="let">
+    <element name="let">
+      <attribute name="name">
+        <ref name="nameValue"/>
+      </attribute>
+      <choice>
+        <attribute name="value">
+          <data type="string" datatypeLibrary=""/>
+        </attribute>
+        <oneOrMore>
+          <ref name="foreign-element"/>
+        </oneOrMore>
+      </choice>
+    </element>
+  </define>
+  <define name="name">
+    <element name="name">
+      <optional>
+        <attribute name="path">
+          <ref name="pathValue"/>
+        </attribute>
+      </optional>
+      <ref name="foreign-empty"/>
+    </element>
+  </define>
+  <define name="ns">
+    <element name="ns">
+      <attribute name="uri">
+        <ref name="uriValue"/>
+      </attribute>
+      <attribute name="prefix">
+        <ref name="nameValue"/>
+      </attribute>
+      <ref name="foreign-empty"/>
+    </element>
+  </define>
+  <define name="p">
+    <element name="p">
+      <optional>
+        <attribute name="id">
+          <data type="ID"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="class">
+          <ref name="classValue"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="icon">
+          <ref name="uriValue"/>
+        </attribute>
+      </optional>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <choice>
+            <text/>
+            <ref name="dir"/>
+            <ref name="emph"/>
+            <ref name="span"/>
+          </choice>
+        </zeroOrMore>
+      </interleave>
+    </element>
+  </define>
+  <define name="param">
+    <element name="param">
+      <attribute name="name">
+        <ref name="nameValue"/>
+      </attribute>
+      <attribute name="value">
+        <ref name="non-empty-string"/>
+      </attribute>
+    </element>
+  </define>
+  <define name="pattern">
+    <element name="pattern">
+      <optional>
+        <attribute name="documents">
+          <ref name="pathValue"/>
+        </attribute>
+      </optional>
+      <ref name="rich"/>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <ref name="inclusion"/>
+        </zeroOrMore>
+        <choice>
+          <group>
+            <attribute name="abstract">
+              <value>true</value>
+            </attribute>
+            <attribute name="id">
+              <data type="ID"/>
+            </attribute>
+            <optional>
+              <ref name="title"/>
+            </optional>
+            <group>
+              <zeroOrMore>
+                <ref name="p"/>
+              </zeroOrMore>
+              <zeroOrMore>
+                <ref name="let"/>
+              </zeroOrMore>
+              <zeroOrMore>
+                <ref name="rule"/>
+              </zeroOrMore>
+            </group>
+          </group>
+          <group>
+            <optional>
+              <attribute name="abstract">
+                <value>false</value>
+              </attribute>
+            </optional>
+            <optional>
+              <attribute name="id">
+                <data type="ID"/>
+              </attribute>
+            </optional>
+            <optional>
+              <ref name="title"/>
+            </optional>
+            <group>
+              <zeroOrMore>
+                <ref name="p"/>
+              </zeroOrMore>
+              <zeroOrMore>
+                <ref name="let"/>
+              </zeroOrMore>
+              <zeroOrMore>
+                <ref name="rule"/>
+              </zeroOrMore>
+            </group>
+          </group>
+          <group>
+            <optional>
+              <attribute name="abstract">
+                <value>false</value>
+              </attribute>
+            </optional>
+            <attribute name="is-a">
+              <data type="IDREF"/>
+            </attribute>
+            <optional>
+              <attribute name="id">
+                <data type="ID"/>
+              </attribute>
+            </optional>
+            <optional>
+              <ref name="title"/>
+            </optional>
+            <group>
+              <zeroOrMore>
+                <ref name="p"/>
+              </zeroOrMore>
+              <zeroOrMore>
+                <ref name="param"/>
+              </zeroOrMore>
+            </group>
+          </group>
+        </choice>
+      </interleave>
+    </element>
+  </define>
+  <define name="phase">
+    <element name="phase">
+      <attribute name="id">
+        <data type="ID"/>
+      </attribute>
+      <ref name="rich"/>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <ref name="inclusion"/>
+        </zeroOrMore>
+        <group>
+          <zeroOrMore>
+            <ref name="p"/>
+          </zeroOrMore>
+          <zeroOrMore>
+            <ref name="let"/>
+          </zeroOrMore>
+          <zeroOrMore>
+            <ref name="active"/>
+          </zeroOrMore>
+        </group>
+      </interleave>
+    </element>
+  </define>
+  <define name="properties">
+    <element name="properties">
+      <zeroOrMore>
+        <ref name="property"/>
+      </zeroOrMore>
+    </element>
+  </define>
+  <define name="property">
+    <element name="property">
+      <attribute name="id">
+        <data type="ID"/>
+      </attribute>
+      <optional>
+        <attribute name="role">
+          <ref name="roleValue"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="scheme"/>
+      </optional>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <choice>
+            <text/>
+            <ref name="name"/>
+            <ref name="value-of"/>
+            <ref name="emph"/>
+            <ref name="dir"/>
+            <ref name="span"/>
+          </choice>
+        </zeroOrMore>
+      </interleave>
+    </element>
+  </define>
+  <define name="report">
+    <element name="report">
+      <attribute name="test">
+        <ref name="exprValue"/>
+      </attribute>
+      <optional>
+        <attribute name="flag">
+          <ref name="flagValue"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="id">
+          <data type="ID"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="diagnostics">
+          <data type="IDREFS"/>
+        </attribute>
+      </optional>
+      <optional>
+        <attribute name="properties">
+          <data type="IDREFS"/>
+        </attribute>
+      </optional>
+      <ref name="rich"/>
+      <ref name="linkable"/>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <choice>
+            <text/>
+            <ref name="name"/>
+            <ref name="value-of"/>
+            <ref name="emph"/>
+            <ref name="dir"/>
+            <ref name="span"/>
+          </choice>
+        </zeroOrMore>
+      </interleave>
+    </element>
+  </define>
+  <define name="rule">
+    <element name="rule">
+      <optional>
+        <attribute name="flag">
+          <ref name="flagValue"/>
+        </attribute>
+      </optional>
+      <ref name="rich"/>
+      <ref name="linkable"/>
+      <interleave>
+        <ref name="foreign"/>
+        <zeroOrMore>
+          <ref name="inclusion"/>
+        </zeroOrMore>
+        <choice>
+          <group>
+            <attribute name="abstract">
+              <value>true</value>
+            </attribute>
+            <attribute name="id">
+              <data type="ID"/>
+            </attribute>
+            <zeroOrMore>
+              <ref name="let"/>
+            </zeroOrMore>
+            <oneOrMore>
+              <choice>
+                <ref name="assert"/>
+                <ref name="report"/>
+                <ref name="extends"/>
+                <ref name="p"/>
+              </choice>
+            </oneOrMore>
+          </group>
+          <group>
+            <attribute name="context">
+              <ref name="pathValue"/>
+            </attribute>
+            <optional>
+              <attribute name="id">
+                <data type="ID"/>
+              </attribute>
+            </optional>
+            <optional>
+              <attribute name="abstract">
+                <value>false</value>
+              </attribute>
+            </optional>
+            <zeroOrMore>
+              <ref name="let"/>
+            </zeroOrMore>
+            <oneOrMore>
+              <choice>
+                <ref name="assert"/>
+                <ref name="report"/>
+                <ref name="extends"/>
+                <ref name="p"/>
+              </choice>
+            </oneOrMore>
+          </group>
+        </choice>
+      </interleave>
+    </element>
+  </define>
+  <define name="span">
+    <element name="span">
+      <attribute name="class">
+        <ref name="classValue"/>
+      </attribute>
+      <interleave>
+        <ref name="foreign"/>
+        <text/>
+      </interleave>
+    </element>
+  </define>
+  <define name="title">
+    <element name="title">
+      <zeroOrMore>
+        <choice>
+          <text/>
+          <ref name="dir"/>
+        </choice>
+      </zeroOrMore>
+    </element>
+  </define>
+  <define name="value-of">
+    <element name="value-of">
+      <attribute name="select">
+        <ref name="pathValue"/>
+      </attribute>
+      <ref name="foreign-empty"/>
+    </element>
+  </define>
+  <!-- common declarations -->
+  <define name="inclusion">
+    <element name="include">
+      <attribute name="href">
+        <ref name="uriValue"/>
+      </attribute>
+      <ref name="foreign-empty"/>
+    </element>
+  </define>
+  <define name="rich">
+    <optional>
+      <attribute name="icon">
+        <ref name="uriValue"/>
+      </attribute>
+    </optional>
+    <optional>
+      <attribute name="see">
+        <ref name="uriValue"/>
+      </attribute>
+    </optional>
+    <optional>
+      <attribute name="fpi">
+        <ref name="fpiValue"/>
+      </attribute>
+    </optional>
+    <optional>
+      <attribute name="xml:lang">
+        <ref name="langValue"/>
+      </attribute>
+    </optional>
+    <optional>
+      <attribute name="xml:space">
+        <choice>
+          <value>preserve</value>
+          <value>default</value>
+        </choice>
+      </attribute>
+    </optional>
+  </define>
+  <define name="linkable">
+    <optional>
+      <attribute name="role">
+        <ref name="roleValue"/>
+      </attribute>
+    </optional>
+    <optional>
+      <attribute name="subject">
+        <ref name="pathValue"/>
+      </attribute>
+    </optional>
+  </define>
+  <define name="foreign">
+    <ref name="foreign-attributes"/>
+    <zeroOrMore>
+      <ref name="foreign-element"/>
+    </zeroOrMore>
+  </define>
+  <define name="foreign-empty">
+    <ref name="foreign-attributes"/>
+  </define>
+  <define name="foreign-attributes">
+    <zeroOrMore>
+      <attribute>
+        <anyName>
+          <except>
+            <nsName ns=""/>
+            <nsName ns="http://www.w3.org/XML/1998/namespace"/>
+          </except>
+        </anyName>
+      </attribute>
+    </zeroOrMore>
+  </define>
+  <define name="foreign-element">
+    <element>
+      <anyName>
+        <except>
+          <nsName/>
+        </except>
+      </anyName>
+      <zeroOrMore>
+        <choice>
+          <attribute>
+            <anyName/>
+          </attribute>
+          <ref name="foreign-element"/>
+          <ref name="schema"/>
+          <text/>
+        </choice>
+      </zeroOrMore>
+    </element>
+  </define>
+  <!-- Data types -->
+  <define name="uriValue">
+    <data type="anyURI"/>
+  </define>
+  <define name="pathValue">
+    <data type="string" datatypeLibrary=""/>
+  </define>
+  <define name="exprValue">
+    <data type="string" datatypeLibrary=""/>
+  </define>
+  <define name="fpiValue">
+    <data type="string" datatypeLibrary=""/>
+  </define>
+  <define name="langValue">
+    <data type="language"/>
+  </define>
+  <define name="roleValue">
+    <data type="string" datatypeLibrary=""/>
+  </define>
+  <define name="flagValue">
+    <data type="string" datatypeLibrary=""/>
+  </define>
+  <define name="nameValue">
+    <data type="string" datatypeLibrary=""/>
+  </define>
+  <!-- In the default query language binding, xsd:NCNAME -->
+  <define name="classValue">
+    <data type="string" datatypeLibrary=""/>
+  </define>
+  <define name="non-empty-string">
+    <data type="token">
+      <param name="minLength">1</param>
+    </data>
+  </define>
+</grammar>
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..21a5d2a069cab9fa327d9a3cd4e4d56c21bb10db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+	Stylesheet for extracting Schematron information from a RELAX-NG schema.
+	Based on the stylesheet for extracting Schematron information from W3C XML Schema.
+	Created by Eddie Robertsson 2002/06/01
+        2009/12/10      hj: changed Schematron namespace to ISO URI (Holger Joukl)
+-->
+<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
+xmlns:sch="http://purl.oclc.org/dsdl/schematron" xmlns:rng="http://relaxng.org/ns/structure/1.0">
+	<!-- Set the output to be XML with an XML declaration and use indentation -->
+	<xsl:output method="xml" omit-xml-declaration="no" indent="yes" standalone="yes"/>
+	<!-- -->
+	<!-- match schema and call recursive template to extract included schemas -->
+	<!-- -->
+	<xsl:template match="/rng:grammar | /rng:element">
+		<!-- call the schema definition template ... -->
+		<xsl:call-template name="gatherSchema">
+			<!-- ... with current node as the $schemas parameter ... -->
+			<xsl:with-param name="schemas" select="."/>
+			<!-- ... and any includes in the $include parameter -->
+			<xsl:with-param name="includes" select="document(/rng:grammar/rng:include/@href
+| //rng:externalRef/@href)"/>
+		</xsl:call-template>
+	</xsl:template>
+	<!-- -->
+	<!-- gather all included schemas into a single parameter variable -->
+	<!-- -->
+	<xsl:template name="gatherSchema">
+		<xsl:param name="schemas"/>
+		<xsl:param name="includes"/>
+		<xsl:choose>
+			<xsl:when test="count($schemas) &lt; count($schemas | $includes)">
+				<!-- when $includes includes something new, recurse ... -->
+				<xsl:call-template name="gatherSchema">
+					<!-- ... with current $includes added to the $schemas parameter ... -->
+					<xsl:with-param name="schemas" select="$schemas | $includes"/>
+					<!-- ... and any *new* includes in the $include parameter -->
+					<xsl:with-param name="includes" select="document($includes/rng:grammar/rng:include/@href
+| $includes//rng:externalRef/@href)"/>
+				</xsl:call-template>
+			</xsl:when>
+			<xsl:otherwise>
+				<!-- we have the complete set of included schemas, so now let's output the embedded schematron -->
+				<xsl:call-template name="output">
+					<xsl:with-param name="schemas" select="$schemas"/>
+				</xsl:call-template>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+	<!-- -->
+	<!-- output the schematron information -->
+	<!-- -->
+	<xsl:template name="output">
+		<xsl:param name="schemas"/>
+		<!-- -->
+		<sch:schema>
+			<!-- get header-type elements - eg title and especially ns -->
+			<!-- title (just one) -->
+			<xsl:copy-of select="$schemas//sch:title[1]"/>
+			<!-- get remaining schematron schema children -->
+			<!-- get non-blank namespace elements, dropping duplicates -->
+			<xsl:for-each select="$schemas//sch:ns">
+				<xsl:if test="generate-id(.) = generate-id($schemas//sch:ns[@prefix = current()/@prefix][1])">
+					<xsl:copy-of select="."/>
+				</xsl:if>
+			</xsl:for-each>
+			<xsl:copy-of select="$schemas//sch:phase"/>
+			<xsl:copy-of select="$schemas//sch:pattern"/>
+			<sch:diagnostics>
+				<xsl:copy-of select="$schemas//sch:diagnostics/*"/>
+			</sch:diagnostics>
+		</sch:schema>
+	</xsl:template>
+	<!-- -->
+</xsl:transform>
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..de0c9ea700d20c78111660e5fe8bf4ddc5a88137
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+        based on an original transform by Eddie Robertsson
+        2001/04/21      fn: added support for included schemas
+        2001/06/27      er: changed XML Schema prefix from xsd: to xs: and changed to the Rec namespace
+        2009/12/10      hj: changed Schematron namespace to ISO URI (Holger Joukl)
+-->
+<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
+xmlns:sch="http://purl.oclc.org/dsdl/schematron" xmlns:xs="http://www.w3.org/2001/XMLSchema">
+        <!-- Set the output to be XML with an XML declaration and use indentation -->
+        <xsl:output method="xml" omit-xml-declaration="no" indent="yes" standalone="yes"/>
+        <!-- -->
+        <!-- match schema and call recursive template to extract included schemas -->
+        <!-- -->
+        <xsl:template match="xs:schema">
+                <!-- call the schema definition template ... -->
+                <xsl:call-template name="gatherSchema">
+                        <!-- ... with current root as the $schemas parameter ... -->
+                        <xsl:with-param name="schemas" select="/"/>
+                        <!-- ... and any includes in the $include parameter -->
+                        <xsl:with-param name="includes" 
+						select="document(/xs:schema/xs:*[self::xs:include or self::xs:import or self::xs:redefine]/@schemaLocation)"/>
+                </xsl:call-template>
+        </xsl:template>
+        <!-- -->
+        <!-- gather all included schemas into a single parameter variable -->
+        <!-- -->
+        <xsl:template name="gatherSchema">
+                <xsl:param name="schemas"/>
+                <xsl:param name="includes"/>
+                <xsl:choose>
+                        <xsl:when test="count($schemas) &lt; count($schemas | $includes)">
+                                <!-- when $includes includes something new, recurse ... -->
+                                <xsl:call-template name="gatherSchema">
+                                        <!-- ... with current $includes added to the $schemas parameter ... -->
+                                        <xsl:with-param name="schemas" select="$schemas | $includes"/>
+                                        <!-- ... and any *new* includes in the $include parameter -->
+                                        <xsl:with-param name="includes" 
+										select="document($includes/xs:schema/xs:*[self::xs:include or self::xs:import or self::xs:redefine]/@schemaLocation)"/>
+                                </xsl:call-template>
+                        </xsl:when>
+                        <xsl:otherwise>
+                                <!-- we have the complete set of included schemas, 
+								so now let's output the embedded schematron -->
+                                <xsl:call-template name="output">
+                                        <xsl:with-param name="schemas" select="$schemas"/>
+                                </xsl:call-template>
+                        </xsl:otherwise>
+                </xsl:choose>
+        </xsl:template>
+        <!-- -->
+        <!-- output the schematron information -->
+        <!-- -->
+        <xsl:template name="output">
+                <xsl:param name="schemas"/>
+                <!-- -->
+                <sch:schema>
+                        <!-- get header-type elements - eg title and especially ns -->
+                        <!-- title (just one) -->
+                        <xsl:copy-of select="$schemas//xs:appinfo/sch:title[1]"/>
+                        <!-- get remaining schematron schema children -->
+                        <!-- get non-blank namespace elements, dropping duplicates -->
+                        <xsl:for-each select="$schemas//xs:appinfo/sch:ns">
+                                <xsl:if test="generate-id(.) = 
+								generate-id($schemas//xs:appinfo/sch:ns[@prefix = current()/@prefix][1])">
+                                        <xsl:copy-of select="."/>
+                                </xsl:if>
+                        </xsl:for-each>
+                        <xsl:copy-of select="$schemas//xs:appinfo/sch:phase"/>
+                        <xsl:copy-of select="$schemas//xs:appinfo/sch:pattern"/>
+                        <sch:diagnostics>
+                                <xsl:copy-of select="$schemas//xs:appinfo/sch:diagnostics/*"/>
+                        </sch:diagnostics>
+                </sch:schema>
+        </xsl:template>
+        <!-- -->
+</xsl:transform>
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..5018395234799dd65d53a339daaddf445a09dbea
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl
@@ -0,0 +1,313 @@
+<?xml version="1.0" encoding="UTF-8"?><?xar XSLT?>
+
+<!-- 
+     OVERVIEW - iso_abstract_expand.xsl
+     
+	    This is a preprocessor for ISO Schematron, which implements abstract patterns. 
+	    It also 
+	       	* extracts a particular schema using an ID, where there are multiple 
+             schemas, such as when they are embedded in the same NVDL script 
+           * allows parameter substitution inside @context, @test, @select, @path
+	    	   * experimentally, allows parameter recognition and substitution inside
+             text (NOTE: to be removed, for compatibility with other implementations,   
+             please do not use this) 
+		
+		This should be used after iso-dsdl-include.xsl and before the skeleton or
+		meta-stylesheet (e.g. iso-svrl.xsl) . It only requires XSLT 1.
+		 
+		Each kind of inclusion can be turned off (or on) on the command line.
+		 
+-->
+
+<!--
+Open Source Initiative OSI - The MIT License:Licensing
+[OSI Approved License]
+
+This source code was previously available under the zlib/libpng license. 
+Attribution is polite.
+
+The MIT License
+
+Copyright (c) 2004-2010  Rick Jellife and Academia Sinica Computing Centre, Taiwan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<!--
+VERSION INFORMATION
+  2013-09-19 RJ
+     * Allow macro expansion in  @path attributes, eg. for   sch:name/@path
+
+  2010-07-10 RJ
+  		* Move to MIT license
+  		
+  2008-09-18 RJ
+  		* move out param test from iso:schema template  to work with XSLT 1. (Noah Fontes)
+  		
+  2008-07-29 RJ 
+  		* Create.  Pull out as distinct XSL in its own namespace from old iso_pre_pro.xsl
+  		* Put everything in private namespace
+  		* Rewrite replace_substring named template so that copyright is clear
+  	
+  2008-07-24 RJ
+       * correct abstract patterns so for correct names: param/@name and
+     param/@value
+    
+  2007-01-12  RJ 
+     * Use ISO namespace
+     * Use pattern/@id not  pattern/@name 
+     * Add Oliver Becker's suggests from old Schematron-love-in list for <copy> 
+     * Add XT -ism?
+  2003 RJ
+     * Original written for old namespace
+     * http://www.topologi.com/resources/iso-pre-pro.xsl
+-->	
+<xslt:stylesheet version="1.0" xmlns:xslt="http://www.w3.org/1999/XSL/Transform" 
+	xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
+    xmlns:iso="http://purl.oclc.org/dsdl/schematron"  
+    xmlns:nvdl="http://purl.oclc.org/dsdl/nvdl"  
+    
+    xmlns:iae="http://www.schematron.com/namespace/iae" 
+     
+      >
+	
+	<xslt:param name="schema-id"></xslt:param>
+	
+	
+	<!-- Entry point: hand the document root over to mode iae:go -->
+	<xslt:template match="/">
+  		<xslt:apply-templates mode="iae:go" select="." />
+	</xslt:template> 
+	
+	
+	<!-- ================================================================================== -->
+	<!-- Normal processing rules                                                            -->
+	<!-- ================================================================================== -->
+	<!-- Output only the selected schema: kept when $schema-id is empty or equals its @id. Declared in mode iae:go because the driver only ever applies templates in that mode. --> 
+	<xslt:template match="iso:schema" mode="iae:go" >
+	    <xsl:if test="string-length($schema-id) =0 or @id= $schema-id ">
+	    	<xslt:copy>
+				<xslt:copy-of select="@*" />
+				<xslt:apply-templates  mode="iae:go" /> 
+			</xslt:copy>
+		</xsl:if>
+	</xslt:template>
+	
+ 
+	<!-- Elements with no iso: ancestor-or-self are dropped from the output;
+		only their children are processed further. -->
+	<xsl:template mode="iae:go" match="*[not(ancestor-or-self::iso:*)]">
+	   <xsl:apply-templates mode="iae:go" />
+	</xsl:template>
+	   
+	
+	<!-- ================================================================================== -->
+	<!-- Handle Schematron abstract pattern preprocessing                                   -->
+	<!-- abstract-to-real calls
+			do-pattern calls 
+				macro-expand calls 
+					multi-macro-expand
+						replace-substring                                                   -->
+	<!-- ================================================================================== -->
+	
+	<!--
+		Abstract patterns allow you to say, for example
+		
+		<pattern id="htmlTable" is-a="table">
+			<param name="row" value="html:tr"/>
+			<param name="cell" value="html:td" />
+			<param name="table" value="html:table" />
+		</pattern>
+		
+		For a good introduction, see Uche Ogbuji's article for IBM developerWorks
+		"Discover the flexibility of Schematron abstract patterns"
+		  http://www-128.ibm.com/developerworks/xml/library/x-stron.html
+		However, note that ISO Schematron uses @name and @value attributes on
+		the iso:param element, and @id not @name on the pattern element.
+		
+	-->
+	
+	<!-- The declaration of an abstract pattern is replaced in the output by a marker comment -->
+	<xsl:template mode="iae:go" match="iso:pattern[@abstract='true']">
+		<xsl:comment>Suppressed abstract pattern <xsl:value-of select="@id"/> was here</xsl:comment>	
+	</xsl:template> 
+	
+	
+	<!-- A pattern that refers to an abstract pattern through @is-a is replaced by the expanded pattern -->
+	<xsl:template mode="iae:go" match="iso:pattern[@is-a]">
+		<!-- marker comment naming the abstract pattern that is being expanded -->
+		<xsl:comment>Start pattern based on abstract <xsl:value-of select="@is-a"/></xsl:comment>
+		<!-- this pattern's @id is passed as the caller, @is-a as the abstract pattern to instantiate -->
+		<xsl:call-template name="iae:abstract-to-real" >
+			<xsl:with-param name="is-a" select="@is-a" />
+			<xsl:with-param name="caller" select="@id" />
+		</xsl:call-template>
+			
+	</xsl:template>
+	 
+	 
+	
+	<!-- Fallback for mode iae:go: shallow-copy the element with its attributes, then recurse into its children -->
+	<xsl:template mode="iae:go" match="*" priority="-1">
+	    <xsl:copy>
+			<xsl:copy-of select="@*" />
+			<xsl:apply-templates mode="iae:go"/> 
+		</xsl:copy>
+	</xsl:template>
+	
+	<!-- Macro expansion of abstract patterns: entry point. -->
+	<!-- Starts the recursion in iae:multi-macro-expand at the first iso:param of the caller -->
+	<xsl:template name="iae:macro-expand">
+		<xsl:param name="caller"/>
+		<xsl:param name="text" />
+		<xsl:call-template name="iae:multi-macro-expand">
+			<xsl:with-param name="paramNumber" select="1"/>
+			<xsl:with-param name="text" select="$text"/>
+			<xsl:with-param name="caller" select="$caller"/>
+		</xsl:call-template>
+		
+	</xsl:template>
+	
+	<!-- Replaces '$' + @name by @value for the caller's iso:param at position $paramNumber,
+	   then recurses with the next position; once no such iso:param exists, $text is written out. -->
+	    
+	<xsl:template name="iae:multi-macro-expand">
+		<xsl:param name="caller"/>
+		<xsl:param name="text" />
+		<xsl:param name="paramNumber" />
+		<!-- the iso:param handled in this step; an empty node-set once past the last one -->
+		<xsl:variable name="currentParam" select="//iso:pattern[@id=$caller]/iso:param[ $paramNumber ]" />
+		<xsl:choose>
+			<xsl:when test="$currentParam">
+				<!-- the partially expanded text becomes the input of the next recursion step -->
+				<xsl:call-template name="iae:multi-macro-expand">
+					<xsl:with-param name="caller" select="$caller"/>
+					<xsl:with-param name="paramNumber" select="$paramNumber + 1"/>
+					<xsl:with-param name="text" >
+						<xsl:call-template name="iae:replace-substring">
+							<xsl:with-param name="original" select="$text"/>
+							<xsl:with-param name="substring"
+								select="concat('$', $currentParam/@name)"/>
+							<xsl:with-param name="replacement"
+								select="$currentParam/@value"/>
+						</xsl:call-template>
+					</xsl:with-param>
+				</xsl:call-template>
+			</xsl:when>
+			<xsl:otherwise><xsl:value-of select="$text" /></xsl:otherwise>
+		
+		</xsl:choose>
+	</xsl:template>
+	
+	
+	<!-- Writes a concrete copy of every pattern whose @id equals $is-a, expanded for $caller -->
+	<xsl:template name="iae:abstract-to-real" >
+		<xsl:param name="caller"/>
+		<xsl:param name="is-a" />
+		<xsl:for-each select="//iso:pattern[@id= $is-a]">
+		<xsl:copy>
+			<!-- id of the result: the caller's id, or a generated id followed by $is-a when $caller is empty -->
+			<xsl:attribute name="id">
+				<xsl:choose>
+					<xsl:when test="string-length( $caller ) = 0">
+						<xsl:value-of select="concat( generate-id(.) , $is-a)" />
+					</xsl:when>
+					<xsl:otherwise><xsl:value-of select="$caller" /></xsl:otherwise>
+				</xsl:choose>
+			</xsl:attribute>
+			
+			<!-- child elements and text nodes are processed in mode iae:do-pattern -->
+			<xsl:apply-templates mode="iae:do-pattern" select="*|text()">
+				<xsl:with-param name="caller"><xsl:value-of select="$caller"/></xsl:with-param>
+			</xsl:apply-templates>
+		</xsl:copy>
+		</xsl:for-each>
+	</xsl:template>
+		
+	
+	<!-- Mode iae:do-pattern: copy an element, macro-expanding @test, @context, @select and @path for $caller; other attributes are copied as they are and child nodes are handled one by one -->
+	<xslt:template mode="iae:do-pattern" match="*">
+		<xslt:param name="caller"/>
+		<xslt:copy>
+			<xslt:for-each select="@*[name()='test' or name()='context' or name()='select'   or name()='path'  ]">
+				<xslt:attribute name="{name()}">
+				<xslt:call-template name="iae:macro-expand">
+						<xslt:with-param name="text"><xslt:value-of select="."/></xslt:with-param>
+						<xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
+					</xslt:call-template>
+				</xslt:attribute>
+			</xslt:for-each>	
+			<xslt:copy-of select="@*[name()!='test'][name()!='context'][name()!='select'][name()!='path']" />
+			<xsl:for-each select="node()">
+				<xsl:choose>
+				    <!-- Experiment: text nodes are macro-expanded as well, to allow parameterized
+				        assertions and so on, without having to have spurious <iso:value-of> calls and
+				        multiple delimiting. Every other child node recurses in mode iae:do-pattern.
+                NOTE: THIS FUNCTIONALITY WILL BE REMOVED IN THE FUTURE    -->
+					<xsl:when test="self::text()">	
+						<xslt:call-template name="iae:macro-expand">
+							<xslt:with-param name="text"><xslt:value-of select="."/></xslt:with-param>
+							<xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
+						</xslt:call-template>
+					</xsl:when>
+					<xsl:otherwise>
+						<xslt:apply-templates select="." mode="iae:do-pattern">
+							<xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
+						</xslt:apply-templates>		
+					</xsl:otherwise>
+				</xsl:choose>
+			</xsl:for-each>			
+		</xslt:copy>
+	</xslt:template>
+	
+	<!-- UTILITIES --> 
+	<!-- replace-substring: writes $original with every occurrence of $substring
+	     replaced by $replacement (which defaults to the empty string) -->
+	<xsl:template name="iae:replace-substring">
+		<xsl:param name="original" />
+		<xsl:param name="substring" />
+		<xsl:param name="replacement" select="''"/>
+		
+		<xsl:choose>
+			<!-- nothing to write -->
+			<xsl:when test="not($original)" />
+			<!-- a non-empty search string that occurs in $original
+			     (the emptiness check matters: contains() is true for an empty search string) -->
+			<xsl:when test="string($substring) and contains($original, $substring)">
+				<!-- text before the first match, then the replacement -->
+				<xsl:value-of select="substring-before($original, $substring)" />
+				<xsl:value-of select="$replacement" />
+				<!-- recursion on the text after the first match -->
+				<xsl:call-template name="iae:replace-substring">
+					<xsl:with-param name="original" select="substring-after($original, $substring)" />
+					<xsl:with-param name="substring" select="$substring" />
+					<xsl:with-param name="replacement" select="$replacement" />
+				</xsl:call-template>
+			</xsl:when>
+			<!-- empty search string, or no occurrence left:
+			     no substitution -->
+			<xsl:otherwise>
+				<xsl:value-of select="$original" />
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+
+
+</xslt:stylesheet>
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..44e5573b73077e015d404935479bdc9344c5ea4d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl
@@ -0,0 +1,1160 @@
+<?xml version="1.0" encoding="UTF-8"?><?xar XSLT?>
+
+<!-- 
+     OVERVIEW : iso_dsdl_include.xsl
+     
+	    This is an inclusion preprocessor for the non-smart text inclusions
+	    of ISO DSDL. It handles 
+	    	<relax:extRef> for ISO RELAX NG
+	    	<sch:include>  for ISO Schematron and Schematron 1.n
+	    	<sch:extends>  for 2009 draft ISO Schematron
+	    	<xi:xinclude>  simple W3C XIncludes for ISO NVRL and DSRL 
+	    	<crdl:ref>     for draft ISO CRDL
+	    	<dtll:include> for draft ISO DTLL
+	    	<* @xlink:href> for simple W3C XLink 1.1 embedded links
+	    	
+		 
+		This should be the first in any chain of processing. It only requires
+		XSLT 1. Each kind of inclusion can be turned off (or on) on the command line.
+		
+		Ids in fragment identifiers or xpointers will be sought in the following
+		order:
+		    * @xml:id
+		    * id() for typed schemas (e.g. from DTD) [NOTE: XInclude does not support this]
+		    * untyped @id 
+		    
+	The proposed behaviour for the update to ISO Schematron has been implemented. If an
+	include points to an element with the same name as the parent, then that element's
+	contents will be included. This supports the merge style of inclusion.    
+	
+	When an inclusion is made, it is preceded by a PI with target DSDL_INCLUDE_START
+	and the href and closed by a PI with target DSDL_INCLUDE_END and the href. This is
+	to allow better location of problems, though only to the file level. 
+	
+	Limitations:
+	* No rebasing: relative paths will be interpreted based on the initial document's
+	path, not the including document. (Severe limitation!)
+	* No checking for circular references
+	* Not full xpointers: only ID matching
+	* <relax:include> not implemented 
+	* XInclude handling of xml:base and xml:lang not implemented   
+-->
+<!-- 
+  VERSION INFORMATION
+	2009-02-25 
+	* Update DSDL namespace to use schematron.com
+	* Tested with SAXON9, Xalan 2.7.1, IE7, 
+	* IE does not like multiple variables in same template with same name: rename.   
+	2008-09-18
+	* Remove new behaviour for include, because it conflicts with existing usage [KH]
+	* Add extends[@href] element with that merge functionality
+	* Generate PIs to notate source of inclusions for potential better diagnostics
+	
+	2008-09-16
+	* Fix for XSLT1
+	
+	2008-08-28
+	* New behaviour for schematron includes: if the pointed to element is the same as the current,
+	include the children.
+	
+	2008-08-20
+	* Fix bug: in XSLT1 cannot do $document/id('x') but need to use for-each
+	
+	2008-08-04
+	* Add support for inclusions in old namespace  
+	
+	2008-08-03
+	* Fix wrong param name include-relaxng & include-crdl (KH, PH)
+	* Allow inclusion of XSLT and XHTML (KH)
+	* Fix inclusion of fragments (KH)
+	
+	2008-07-25
+	* Add selectable input parameter
+	
+	2008-07-24  
+	* RJ New
+-->
+<!--
+	LEGAL INFORMATION
+	
+	Copyright (c) 2008 Rick Jelliffe 
+	
+	This software is provided 'as-is', without any express or implied warranty. 
+	In no event will the authors be held liable for any damages arising from 
+	the use of this software.
+	
+	Permission is granted to anyone to use this software for any purpose, 
+	including commercial applications, and to alter it and redistribute it freely,
+	subject to the following restrictions:
+	
+	1. The origin of this software must not be misrepresented; you must not claim
+	that you wrote the original software. If you use this software in a product, 
+	an acknowledgment in the product documentation would be appreciated but is 
+	not required.
+	
+	2. Altered source versions must be plainly marked as such, and must not be 
+	misrepresented as being the original software.
+	
+	3. This notice may not be removed or altered from any source distribution.
+-->
+<xslt:stylesheet version="1.0"
+	xmlns:xslt="http://www.w3.org/1999/XSL/Transform"
+	xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+	xmlns:iso="http://purl.oclc.org/dsdl/schematron"
+	xmlns:nvdl="http://purl.oclc.org/dsdl/nvdl"
+	xmlns:xhtml="http://www.w3.org/1999/xhtml"
+	xmlns:schold="http://www.ascc.net/xml/schematron"
+	xmlns:crdl="http://purl.oclc.org/dsdl/crepdl/ns/structure/1.0"
+	xmlns:xi="http://www.w3.org/2001/XInclude"
+	xmlns:dtll="http://www.jenitennison.com/datatypes"
+	xmlns:dsdl="http://www.schematron.com/namespace/dsdl"
+	xmlns:relax="http://relaxng.org/ns/structure/1.0"
+	xmlns:xlink="http://www.w3.org/1999/xlink">
+	<!-- Note: The URL for the dsdl namespace is not official -->
+
+
+	<xsl:param name="include-schematron">true</xsl:param>
+	<xsl:param name="include-crdl">true</xsl:param>
+	<xsl:param name="include-xinclude">true</xsl:param>
+	<xsl:param name="include-dtll">true</xsl:param>
+	<xsl:param name="include-relaxng">true</xsl:param>
+	<xsl:param name="include-xlink">true</xsl:param>
+
+	<xslt:template match="/"><!-- entry point: process the document root in mode dsdl:go -->
+		<xslt:apply-templates mode="dsdl:go" select="." />
+	</xslt:template>
+
+	<!-- Default rule for mode dsdl:go: shallow-copy the node with its attributes, then recurse into its children -->
+	<xsl:template mode="dsdl:go" match="node()" priority="-1">
+		<xsl:copy>
+			<xsl:copy-of select="@*" />
+			<xsl:apply-templates mode="dsdl:go" />
+		</xsl:copy>
+	</xsl:template>
+
+
+
+	<!-- =========================================================== -->
+	<!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages   -->
+	<!-- Part 2 - Regular grammar-based validation - RELAX NG        -->
+	<!-- This only implements relax:extRef not relax:include which   -->
+	<!-- is complex.                                                 -->
+	<!-- =========================================================== -->
+	<xslt:template match="relax:extRef" mode="dsdl:go">
+		<!-- Replaces a RELAX NG extRef by what its @href points to, bracketed by DSDL_INCLUDE_START/END PIs. -->
+		<!-- When $include-relaxng is not 'true' the extRef itself is copied through (still between the PIs). -->
+		<!-- @href is split at its first '#' into a document URI and a fragment id -->
+		<xsl:variable name="source-href" select="string(@href)" />
+		<xsl:variable name="document-uri"
+			select="substring-before(concat(@href,'#'), '#')" />
+		<xsl:variable name="fragment-id"
+			select="substring-after(@href, '#')" />
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+		<xsl:choose>
+			<xsl:when test="not( $include-relaxng = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+
+				<xsl:choose>
+
+					<xsl:when
+						test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+						<xsl:message>
+							Error: Impossible URL in RELAX NG extRef
+							include
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is an embedded schema elsewhere in the same document -->
+					<xslt:when
+						test="string-length( $document-uri ) = 0">
+						<xslt:apply-templates mode="dsdl:go"
+							select="//*[@xml:id= $fragment-id ] | id( $fragment-id) | //*[@id= $fragment-id ]" />
+					</xslt:when>
+
+					<xsl:when
+						test="string-length( $fragment-id ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( $document-uri,/ )" />
+
+						<xsl:if test="not($theDocument_1)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- use a for-each so that id() works on the external document; the context node is now that document, so the href is read from $source-href -->
+						<xsl:for-each select="$theDocument_1">
+							<xsl:variable name="theFragment_1"
+								select="$theDocument_1//*[@xml:id= $fragment-id ]        
+                  |  id( $fragment-id)          
+              | $theDocument_1//*[@id= $fragment-id ]" />
+							<xsl:if test="not($theFragment_1)">
+								<xsl:message terminate="no">
+									<xsl:text>Unable to locate id attribute: </xsl:text>
+									<xsl:value-of select="$source-href" />
+								</xsl:message>
+							</xsl:if>
+							<xsl:apply-templates
+								select=" $theFragment_1[1]" mode="dsdl:go" />
+						</xsl:for-each>
+					</xsl:when>
+
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_2"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="theFragment_2"
+							select="$theDocument_2/*" />
+						<xsl:if test="not($theDocument_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+
+						<xsl:if test="not($theFragment_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to locate id attribute: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<xsl:apply-templates select="$theFragment_2 "
+							mode="dsdl:go" />
+					</xsl:otherwise>
+				</xsl:choose>
+
+			</xsl:otherwise>
+		</xsl:choose>
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+	</xslt:template>
+
+
+
+	<!-- =========================================================== -->
+	<!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages   -->
+	<!-- Part 3 - Rule-based validation - Schematron                 -->
+	<!-- =========================================================== -->
+
+
+	<!-- Extend the URI syntax to allow # references -->
+	<!-- Add experimental support for simple containers like  /xxx:xxx/iso:pattern to allow better includes -->
+	<xsl:template match="iso:include" mode="dsdl:go">
+		<!-- Replaces an iso:include by what its @href points to, bracketed by DSDL_INCLUDE_START/END PIs; copied through instead when $include-schematron is not 'true' -->
+		<xsl:variable name="document-uri"
+			select="substring-before(concat(@href,'#'), '#')" />
+		<xsl:variable name="fragment-id"
+			select="substring-after(@href, '#')" />
+
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+
+		<xsl:choose>
+			<xsl:when test="not( $include-schematron = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+
+				<xsl:choose>
+
+					<xsl:when
+						test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+						<xsl:message>
+							Error: Impossible URL in Schematron include
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is an embedded schema elsewhere in the same document -->
+					<xslt:when
+						test="string-length( $document-uri ) = 0">
+						<xslt:apply-templates mode="dsdl:go"
+							select="//iso:*[@xml:id= $fragment-id ] 
+              	 |id( $fragment-id)
+              	 | //iso:*[@id= $fragment-id ]" />
+					</xslt:when>
+
+					<!-- case where there is a fragment in another document (should be an iso: element) -->
+					<!-- There are three cases for includes with fragment:
+						0) No href file or no matching id - error!
+						1) REMOVED
+						
+						2) The linked-to element is sch:schema: this is reported as an error.
+						(The parent of the include is not inspected; other kinds of
+						containment problems are not checked for in this version
+						either.)
+						
+						3) Otherwise, include the pointed-to element
+					-->
+
+					<xsl:when
+						test="string-length( $fragment-id ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="source-href" select="string(@href)" />
+
+						<!-- case 0 -->
+						<xsl:if test="not($theDocument_1)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- use for-each to rebase id() to the external document; the context node is now that document, so the href is read from $source-href -->
+						<xsl:for-each select="$theDocument_1">
+							<xsl:variable name="theFragment_1"
+								select=" $theDocument_1//iso:*[@xml:id= $fragment-id ] |
+              	 		id($fragment-id) |
+              			$theDocument_1//iso:*[@id= $fragment-id ]" />
+
+
+							<xsl:choose>
+								<!-- case 0 -->
+								<xsl:when test="not($theFragment_1)">
+									<xsl:message terminate="no">
+										<xsl:text>Unable to locate id attribute: </xsl:text>
+										<xsl:value-of select="$source-href" />
+									</xsl:message>
+								</xsl:when>
+
+
+								<!-- case 1 REMOVED -->
+
+								<!-- case 2 -->
+								<xsl:when
+									test=" $theFragment_1/self::iso:schema ">
+									<xsl:message>
+										Schema error: Use include to
+										include fragments, not a whole
+										schema
+									</xsl:message>
+								</xsl:when>
+
+								<!-- case 3 -->
+								<xsl:otherwise>
+									<xsl:apply-templates
+										select=" $theFragment_1[1]" mode="dsdl:go" />
+								</xsl:otherwise>
+							</xsl:choose>
+						</xsl:for-each>
+					</xsl:when>
+
+					<!-- Case where there is no ID so we include the whole document -->
+					<!-- Experimental addition: include fragments of children -->
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_2"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="theFragment_2"
+							select="$theDocument_2/iso:*" />
+						<xsl:variable name="theContainedFragments"
+							select="$theDocument_2/*/iso:* | $theDocument_2/*/xsl:* | $theDocument_2/*/xhtml:*" />
+						<xsl:if test="not($theDocument_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+
+						<!-- There are three cases for includes:
+							0) No text specified- error!
+							
+							1) REMOVED
+							
+							2) The linked-to element is sch:schema: this is reported as an error.
+							(The parent of the include is not inspected; other kinds of
+							containment problems are not checked for in this version
+							either.)
+							
+							3) Otherwise, include the pointed-to element
+						-->
+						<xsl:choose>
+							<!-- case 0 -->
+							<xsl:when
+								test="not($theFragment_2) and not ($theContainedFragments)">
+								<xsl:message terminate="no">
+									<xsl:text>Unable to locate id attribute: </xsl:text>
+									<xsl:value-of select="@href" />
+								</xsl:message>
+							</xsl:when>
+
+							<!-- case 1 removed -->
+
+							<!-- case 2 -->
+							<xsl:when
+								test=" $theFragment_2/self::iso:schema or $theContainedFragments/self::iso:schema">
+								<xsl:message>
+									Schema error: Use include to include
+									fragments, not a whole schema
+								</xsl:message>
+							</xsl:when>
+
+							<!-- If this were XLST 2, we could use  
+								if ($theFragment) then $theFragment else $theContainedFragments
+								here (thanks to KN)
+							-->
+							<!-- case 3. NOTE(review): only $theFragment_2 is applied; $theContainedFragments is tested above but never included. Confirm this is intended. -->
+							<xsl:otherwise>
+								<xsl:apply-templates
+									select="$theFragment_2 " mode="dsdl:go" />
+							</xsl:otherwise>
+						</xsl:choose>
+					</xsl:otherwise>
+				</xsl:choose>
+			</xsl:otherwise>
+		</xsl:choose>
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+	</xsl:template>
+
+
+	<!-- WARNING   sch:extends[@href] is experimental and non standard  -->
+	<!-- Basically, it adds the children of the selected element, not the element itself.  -->
+	<xsl:template match="iso:extends[@href]" mode="dsdl:go">
+
+		<xsl:variable name="document-uri"
+			select="substring-before(concat(@href,'#'), '#')" />
+		<xsl:variable name="fragment-id"
+			select="substring-after(@href, '#')" />
+
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+
+		<xsl:choose>
+			<xsl:when test="not( $include-schematron = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+
+				<xsl:choose>
+
+					<xsl:when
+						test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+						<xsl:message>
+							Error: Impossible URL in Schematron include
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is in embedded schema in the same document elsewhere -->
+					<xslt:when
+						test="string-length( $document-uri ) = 0">
+						<xslt:apply-templates mode="dsdl:go"
+							select="//iso:*[@xml:id= $fragment-id ]/* 
+              	 |id( $fragment-id)/*
+              	 | //iso:*[@id= $fragment-id ]/*" />
+					</xslt:when>
+
+					<!-- case where there is a fragment in another document (should be an iso: element) -->
+					<!-- There are three cases for includes with fragment:
+						0) No href file or no matching id - error!
+						1) REMOVED
+						
+						2) REMOVED
+						
+						3) Otherwise, include the pointed-to element
+					-->
+
+					<xsl:when
+						test="string-length( $fragment-id ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="originalParent" select=".." />
+
+						<!-- case 0 -->
+						<xsl:if test="not($theDocument_1)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- use for-each to rebase id() to external document -->
+						<xsl:for-each select="$theDocument_1">
+							<xsl:variable name="theFragment_1"
+								select=" $theDocument_1//iso:*[@xml:id= $fragment-id ] |
+              	 		id($fragment-id) |
+              			$theDocument_1//iso:*[@id= $fragment-id ]" />
+
+
+							<xsl:choose>
+								<!-- case 0 -->
+								<xsl:when test="not($theFragment_1)">
+									<xsl:message terminate="no">
+										<xsl:text>Unable to locate id attribute: </xsl:text>
+										<xsl:value-of select="@href" />
+									</xsl:message>
+								</xsl:when>
+
+
+								<!-- case 1 REMOVED -->
+
+								<!-- case 2 REMOVED -->
+
+
+								<!-- case 3 -->
+								<xsl:otherwise>
+
+									<xsl:apply-templates
+										select=" $theFragment_1[1]/*" mode="dsdl:go" />
+								</xsl:otherwise>
+							</xsl:choose>
+						</xsl:for-each>
+					</xsl:when>
+
+					<!-- Case where there is no ID so we include the whole document -->
+					<!-- Experimental addition: include fragments of children -->
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_2"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="theFragment_2"
+							select="$theDocument_2/iso:*" />
+						<xsl:variable name="theContainedFragments"
+							select="$theDocument_2/*/iso:* | $theDocument_2/*/xsl:* | $theDocument_2/*/xhtml:*" />
+						<xsl:if test="not($theDocument_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+
+						<!-- There are three cases for includes:
+							0) No text specified- error!
+							
+							1) REMOVED
+							
+							2) REMOVED
+							
+							3) Otherwise, include the pointed-to element
+						-->
+						<xsl:choose>
+							<!-- case 0 -->
+							<xsl:when
+								test="not($theFragment_2) and not ($theContainedFragments)">
+								<xsl:message terminate="no">
+									<xsl:text>Unable to locate id attribute: </xsl:text>
+									<xsl:value-of select="@href" />
+								</xsl:message>
+							</xsl:when>
+
+							<!-- case 1 removed -->
+
+							<!-- case 2 removed -->
+
+							<!-- If this were XSLT 2, we could use
+								if ($theFragment) then $theFragment else $theContainedFragments
+								here (thanks to KN)
+							-->
+							<!-- case 3 -->
+							<xsl:otherwise>
+								<xsl:apply-templates
+									select="$theFragment_2/* " mode="dsdl:go" />
+							</xsl:otherwise>
+						</xsl:choose>
+					</xsl:otherwise>
+				</xsl:choose>
+			</xsl:otherwise>
+		</xsl:choose>
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+	</xsl:template>
+
+
+
+	<!-- =========================================================== -->
+	<!-- Handle Schematron 1.6 inclusions: clone of ISO code above   -->
+	<!-- =========================================================== -->
+
+
+	<!-- Extend the URI syntax to allow # references -->
+	<!-- Add experimental support for simple containers like  /xxx:xxx/schold:pattern to allow better includes -->
+	<xsl:template match="schold:include" mode="dsdl:go">
+		<xsl:variable name="document-uri"
+			select="substring-before(concat(@href,'#'), '#')" />
+		<xsl:variable name="fragment-id"
+			select="substring-after(@href, '#')" />
+		<!-- Resolves a Schematron 1.6 include. @href is split at its first '#': document-uri is the text before it, fragment-id the text after it (either may be empty). The result is bracketed by DSDL_INCLUDE_START / DSDL_INCLUDE_END processing instructions whose content is @href. -->
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+		<!-- Unless $include-schematron (declared outside this template) equals 'true', the include element is copied through unresolved -->
+		<xsl:choose>
+			<xsl:when test="not( $include-schematron = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:choose>
+					<!-- neither a document nor a fragment was named: report and include nothing -->
+					<xsl:when
+						test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+						<xsl:message>
+							Error: Impossible URL in Schematron include
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is an embedded schema in the same document elsewhere -->
+					<xslt:when
+						test="string-length( $document-uri ) = 0">
+						<xslt:apply-templates mode="dsdl:go"
+							select="//schold:*[@xml:id= $fragment-id ]
+								| id( $fragment-id )
+								| //schold:*[@id= $fragment-id ]" />
+					</xslt:when>
+
+					<!-- case where there is a fragment in another document (should be a schold: element) -->
+					<xsl:when
+						test="string-length( $fragment-id ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( $document-uri,/ )" />
+						<xsl:if test="not($theDocument_1)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- use for-each to rebase id() to $theDocument; inside it the current node is that document's root (which has no @href), so the message below rebuilds the href from the two variables -->
+						<xsl:for-each select="$theDocument_1">
+							<xsl:variable name="theFragment_1"
+								select="$theDocument_1//schold:*[@xml:id= $fragment-id ]
+									| id( $fragment-id )
+									| $theDocument_1//schold:*[@id= $fragment-id ]" />
+							<xsl:if
+								test=" $theFragment_1/self::schold:schema ">
+								<xsl:message>
+									Schema error: Use include to include
+									fragments, not a whole schema
+								</xsl:message>
+							</xsl:if>
+							<xsl:if test="not($theFragment_1)">
+								<xsl:message terminate="no">
+									<xsl:text>Unable to locate id attribute: </xsl:text>
+									<xsl:value-of select="concat($document-uri, '#', $fragment-id)" />
+								</xsl:message>
+							</xsl:if>
+							<xsl:apply-templates
+								select=" $theFragment_1[1]" mode="dsdl:go" />
+						</xsl:for-each>
+					</xsl:when>
+
+					<!-- Case where there is no ID so we include the whole document: the root element is taken when it is a schold: element (iso: roots are still accepted, as before) -->
+					<!-- Experimental addition: include fragments of children (used only when the root element itself is not selected) -->
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_2"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="theFragment_2"
+							select="$theDocument_2/schold:* | $theDocument_2/iso:*" />
+						<xsl:variable name="theContainedFragments"
+							select="$theDocument_2/*/schold:* | $theDocument_2/*/xsl:* | $theDocument_2/*/xhtml:*" />
+						<xsl:if test="not($theDocument_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+
+						<xsl:if
+							test=" $theFragment_2/self::schold:schema or $theContainedFragments/self::schold:schema">
+							<xsl:message>
+								Schema error: Use include to include
+								fragments, not a whole schema
+							</xsl:message>
+						</xsl:if>
+						<xsl:if
+							test="not($theFragment_2) and not ($theContainedFragments)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to locate id attribute: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- If this were XSLT 2, we could use
+							if ($theFragment) then $theFragment else $theContainedFragments
+							here (thanks to KN)
+						-->
+						<xsl:choose>
+							<xsl:when test=" $theFragment_2 ">
+								<xsl:apply-templates
+									select="$theFragment_2 " mode="dsdl:go" />
+							</xsl:when>
+							<xsl:otherwise>
+								<!-- WARNING!  EXPERIMENTAL! Use at your own risk. This may be discontinued! -->
+								<xsl:apply-templates
+									select="  $theContainedFragments " mode="dsdl:go" />
+							</xsl:otherwise>
+						</xsl:choose>
+					</xsl:otherwise>
+				</xsl:choose>
+
+			</xsl:otherwise>
+		</xsl:choose>
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+	</xsl:template>
+	<!-- =========================================================== -->
+	<!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages   -->
+	<!-- Part 5 - DataType Library Language - DTLL                   -->
+	<!-- Committee Draft  Experimental support only                  -->
+	<!-- The <include> element may well be replaced by XInclude in   -->
+	<!-- any final version.                                          -->
+	<!-- =========================================================== -->
+	<xslt:template match="dtll:include" mode="dsdl:go">
+		<!-- Insert subschema: resolves a DTLL include. @href is split at its first '#': document-uri is the text before it, fragment-id the text after it (either may be empty). -->
+		<!-- The result is bracketed by DSDL_INCLUDE_START / DSDL_INCLUDE_END processing instructions whose content is @href. -->
+		<xsl:variable name="document-uri"
+			select="substring-before(concat(@href,'#'), '#')" />
+		<xsl:variable name="fragment-id"
+			select="substring-after(@href, '#')" />
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+		<xsl:choose>
+			<xsl:when test="not( $include-dtll = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:choose>
+					<!-- neither a document nor a fragment was named: report and include nothing -->
+					<xsl:when
+						test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+						<xsl:message>
+							Error: Impossible URL in DTLL include
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is an embedded schema in the same document elsewhere -->
+					<xslt:when
+						test="string-length( $document-uri ) = 0">
+						<xslt:apply-templates mode="dsdl:go"
+							select="//*[@xml:id= $fragment-id ] | id( $fragment-id )
+								| //*[@id= $fragment-id ]" />
+					</xslt:when>
+					<!-- case where there is a fragment in another document: only the first match in document order is included -->
+					<xsl:when
+						test="string-length( $fragment-id ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( $document-uri,/ )" />
+						<xsl:if test="not($theDocument_1)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- use for-each to rebase id() to $theDocument; inside it the current node is that document's root (which has no @href), so the message below rebuilds the href from the two variables -->
+						<xsl:for-each select="$theDocument_1">
+							<xsl:variable name="theFragment_1"
+								select="$theDocument_1//*[@xml:id= $fragment-id ]
+									| id( $fragment-id )
+									| $theDocument_1//*[@id= $fragment-id ]" />
+							<xsl:if test="not($theFragment_1)">
+								<xsl:message terminate="no">
+									<xsl:text>Unable to locate id attribute: </xsl:text>
+									<xsl:value-of select="concat($document-uri, '#', $fragment-id)" />
+								</xsl:message>
+							</xsl:if>
+							<xsl:apply-templates
+								select=" $theFragment_1[1]" mode="dsdl:go" />
+						</xsl:for-each>
+					</xsl:when>
+					<!-- case where there is no fragment id: the root element of the referenced document is included -->
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_2"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="theFragment_2"
+							select="$theDocument_2/*" />
+
+						<xsl:if test="not($theDocument_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+
+						<xsl:if test="not($theFragment_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to locate id attribute: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<xsl:apply-templates select="$theFragment_2 "
+							mode="dsdl:go" />
+					</xsl:otherwise>
+				</xsl:choose>
+
+			</xsl:otherwise>
+		</xsl:choose>
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+	</xslt:template>
+
+	<!-- =========================================================== -->
+	<!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages   -->
+	<!-- Part 7 - Character Repertoire Description Language - CRDL   -->
+	<!-- Final Committee Draft 2008-01-11 Experimental support only  -->
+	<!-- =========================================================== -->
+	<xslt:template match="crdl:ref" mode="dsdl:go">
+		<!-- Insert subschema: resolves a CRDL ref. @href is split at its first '#': document-uri is the text before it, fragment-id the text after it (either may be empty). -->
+		<!-- The result is bracketed by DSDL_INCLUDE_START / DSDL_INCLUDE_END processing instructions whose content is @href. -->
+		<xsl:variable name="document-uri"
+			select="substring-before(concat(@href,'#'), '#')" />
+		<xsl:variable name="fragment-id"
+			select="substring-after(@href, '#')" />
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+		<xsl:choose>
+			<xsl:when test="not( $include-crdl = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:choose>
+					<!-- neither a document nor a fragment was named: report and include nothing -->
+					<xsl:when
+						test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+						<xsl:message>
+							Error: Impossible URL in CRDL include
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is an embedded schema in the same document elsewhere -->
+					<xslt:when
+						test="string-length( $document-uri ) = 0">
+
+						<xslt:apply-templates mode="dsdl:go"
+							select="//*[@xml:id= $fragment-id ] | id( $fragment-id )
+								| //*[@id= $fragment-id ]" />
+					</xslt:when>
+					<!-- case where there is a fragment in another document -->
+					<xsl:when
+						test="string-length( $fragment-id ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( $document-uri,/ )" />
+						<xsl:if test="not($theDocument_1)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- use for-each to rebase id() to $theDocument; inside it the current node is that document's root (which has no @href), so the message below rebuilds the href from the two variables -->
+						<xsl:for-each select="$theDocument_1">
+							<xsl:variable name="theFragment_1"
+								select="$theDocument_1//*[@xml:id= $fragment-id ]
+									| id( $fragment-id )
+									| $theDocument_1//*[@id= $fragment-id ]" />
+							<!-- every matching node is included here (no [1] restriction on the node-set) -->
+							<xsl:if test="not($theFragment_1)">
+								<xsl:message terminate="no">
+									<xsl:text>Unable to locate id attribute: </xsl:text>
+									<xsl:value-of select="concat($document-uri, '#', $fragment-id)" />
+								</xsl:message>
+							</xsl:if>
+							<xsl:apply-templates select=" $theFragment_1 "
+								mode="dsdl:go" />
+						</xsl:for-each>
+					</xsl:when>
+					<!-- case where there is no fragment id: the root element of the referenced document is included -->
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_2"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="theFragment_2"
+							select="$theDocument_2/*" />
+
+						<xsl:if test="not($theDocument_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+						<xsl:if test="not($theFragment_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to locate id attribute: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+
+						<xsl:apply-templates select="$theFragment_2"
+							mode="dsdl:go" />
+					</xsl:otherwise>
+				</xsl:choose>
+
+			</xsl:otherwise>
+		</xsl:choose>
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+	</xslt:template>
+
+
+	<!-- =========================================================== -->
+	<!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages   -->
+	<!-- Part 4 - Namespace-based Validation Dispatching Language - NVDL -->
+	<!-- Note: This does not include schemas referenced for          -->
+	<!-- validation, it merely handles any simple XIncludes          -->
+	<!-- =========================================================== -->
+	<!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages   -->
+	<!-- Part 8 - Document Schema Renaming Language - DSRL           -->
+	<!-- Note: Final? Committee Draft   Experimental support only    -->
+	<!-- =========================================================== -->
+	<!-- XInclude support for id based references only, with 1 level -->
+	<!-- of fallback.                                                -->
+	<!-- =========================================================== -->
+
+	<xslt:template mode="dsdl:go"
+		match="xi:include[@href][not(@parseType) or @parseType ='xml']">
+		<!-- Simple inclusions only here: XML-parsed xi:include elements, resolved by @href and an optional bare-id @xpointer. The result is bracketed by DSDL_INCLUDE_START / DSDL_INCLUDE_END processing instructions whose content is @href. -->
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+		<xsl:choose>
+			<xsl:when test="not( $include-xinclude = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:choose>
+					<!-- the first three branches reject unsupported input and terminate the transformation -->
+					<xsl:when test="contains( @href, '#')">
+						<xsl:message terminate="yes">
+							Fatal error: Xinclude href contains fragment
+							identifier #
+						</xsl:message>
+					</xsl:when>
+
+					<!-- an xpointer containing '(' is a scheme-based pointer, not a bare id -->
+					<xsl:when test="contains( @xpointer, '(')">
+						<xsl:message terminate="yes">
+							Fatal error: Sorry, this software only
+							supports simple ids in XInclude xpointers
+						</xsl:message>
+					</xsl:when>
+
+					<xsl:when
+						test="string-length( @href ) = 0 and string-length( @xpointer ) = 0">
+
+						<xsl:message terminate="yes">
+							Fatal Error: Impossible URL in XInclude
+							include
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is an embedded schema in the same document elsewhere -->
+					<xslt:when test="string-length( @href ) = 0">
+
+						<xslt:apply-templates mode="dsdl:go"
+							select="//*[@xml:id= current()/@xpointer  ] | id( @xpointer)
+								| //*[@id= current()/@xpointer  ]" />
+					</xslt:when>
+					<!-- case where there is an id-based xpointer into another document -->
+					<xsl:when
+						test="string-length( @xpointer ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( @href,/ )" />
+						<xsl:variable name="theFragment_1"
+							select="$theDocument_1//*[@xml:id= current()/@xpointer  ]
+             
+              | $theDocument_1//*[@id= current()/@xpointer  ]" />
+						<!-- removed
+							| $theDocument_1/id( @xpointer)
+							because it requires rebasing in XSLT1 and that would mess up the use of current()
+						-->
+
+
+						<!-- Allow one level of fallback, to another XInclude: consulted only when the primary document could not be loaded -->
+						<xsl:if test="not($theDocument_1)">
+							<xsl:choose>
+								<xsl:when test="xi:fallback">
+									<xsl:variable name="theDocument_2"
+										select="document( xi:fallback[1]/xi:include[not(@parseType)
+                    	 or @parseType='xml']/@href,/ )" />
+									<xsl:variable name="theFragment_2"
+										select="$theDocument_2//*[@xml:id= current()/xi:fallback[1]/xi:include/@xpointer  ]
+              				| $theDocument_2//*[@id= current()/xi:fallback[1]/xi:include/@xpointer  ]" />
+									<!-- removed 
+										| $theDocument_2/id( xi:fallback[1]/xi:include/@xpointer)
+										because id() would need rebasing in XSLT1 and that would mess up use of current()
+									-->
+									<xsl:apply-templates select="$theFragment_2" mode="dsdl:go" /> <!-- emit the fallback fragment; without this the fallback was located but never included -->
+									<xsl:if
+										test="not($theDocument_2)">
+
+										<xsl:message terminate="no">
+											<xsl:text>Unable to open referenced included file and fallback
+									file: </xsl:text>
+											<xsl:value-of
+												select="@href" />
+										</xsl:message>
+									</xsl:if>
+								</xsl:when>
+								<xsl:otherwise>
+									<xsl:message terminate="no">
+										<xsl:text>Unable to open referenced included file: </xsl:text>
+										<xsl:value-of select="@href" />
+									</xsl:message>
+								</xsl:otherwise>
+							</xsl:choose>
+						</xsl:if>
+						<xsl:apply-templates select=" $theFragment_1"
+							mode="dsdl:go" />
+					</xsl:when>
+
+					<!-- Document but no fragment specified: the root element is included; xi:fallback is not consulted in this branch -->
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_3"
+							select="document( @href,/ )" />
+						<xsl:variable name="theFragment_3"
+							select="$theDocument_3/*" />
+
+						<xsl:if test="not($theDocument_3)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@href" />
+							</xsl:message>
+						</xsl:if>
+
+						<xsl:apply-templates select="$theFragment_3 "
+							mode="dsdl:go" />
+					</xsl:otherwise>
+				</xsl:choose>
+
+			</xsl:otherwise>
+		</xsl:choose>
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@href" />
+		</xsl:processing-instruction>
+	</xslt:template>
+
+	<!-- =========================================================== -->
+	<!-- W3C XLink 1.1 embedded simple links                        -->
+	<!-- =========================================================== -->
+	<xslt:template
+		match="*[@xlink:href][not(parent::*[@xlink:type='complex'])]
+	           [not(@xlink:type) or (@xlink:type='simple')]
+	           [@xlink:show='embed']
+	           [not(@xlink:actuate) or (@xlink:actuate='onLoad')]"
+		mode="dsdl:go" priority="1">
+		<!-- Resolves a simple embedding XLink. @xlink:href is split at its first '#': document-uri is the text before it, fragment-id the text after it (either may be empty). The result is bracketed by DSDL_INCLUDE_START / DSDL_INCLUDE_END processing instructions whose content is @xlink:href. -->
+		<xsl:variable name="document-uri"
+			select="substring-before(concat(@xlink:href,'#'), '#')" />
+		<xsl:variable name="fragment-id"
+			select="substring-after(@xlink:href, '#')" />
+		<xsl:processing-instruction name="DSDL_INCLUDE_START">
+			<xsl:value-of select="@xlink:href" />
+		</xsl:processing-instruction>
+		<xsl:choose>
+			<xsl:when test="not( $include-xlink = 'true' )">
+				<xslt:copy>
+					<xslt:copy-of select="@*" />
+					<xslt:apply-templates mode="dsdl:go" />
+				</xslt:copy>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:choose>
+					<!-- neither a document nor a fragment was named: report and include nothing -->
+					<xsl:when
+						test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+						<xsl:message>
+							Error: Impossible URL in XLink embedding
+							link
+						</xsl:message>
+					</xsl:when>
+
+					<!-- this case is when there is an embedded schema in the same document elsewhere -->
+					<xslt:when
+						test="string-length( $document-uri ) = 0">
+						<xslt:apply-templates mode="dsdl:go"
+							select="//*[@xml:id= $fragment-id ] | id( $fragment-id )
+								| //*[@id= $fragment-id ]" />
+					</xslt:when>
+					<!-- case where there is a fragment in another document: only the first match in document order is included -->
+					<xsl:when
+						test="string-length( $fragment-id ) &gt; 0">
+						<xsl:variable name="theDocument_1"
+							select="document( $document-uri,/ )" />
+						<xsl:if test="not($theDocument_1)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@xlink:href" />
+							</xsl:message>
+						</xsl:if>
+						<!-- use for-each to rebase id() to $theDocument; inside it the current node is that document's root (which has no @xlink:href), so the message below rebuilds the href from the two variables -->
+						<xsl:for-each select="$theDocument_1">
+							<xsl:variable name="theFragment_1"
+								select="$theDocument_1//*[@xml:id= $fragment-id ]
+									| id( $fragment-id )
+									| $theDocument_1//*[@id= $fragment-id ]" />
+							<xsl:if test="not($theFragment_1)">
+								<xsl:message terminate="no">
+									<xsl:text>Unable to locate id attribute: </xsl:text>
+									<xsl:value-of select="concat($document-uri, '#', $fragment-id)" />
+								</xsl:message>
+							</xsl:if>
+							<xsl:apply-templates
+								select=" $theFragment_1[1]" mode="dsdl:go" />
+						</xsl:for-each>
+					</xsl:when>
+					<!-- case where there is no fragment id: the root element of the referenced document is included -->
+					<xsl:otherwise>
+						<xsl:variable name="theDocument_2"
+							select="document( $document-uri,/ )" />
+						<xsl:variable name="theFragment_2"
+							select="$theDocument_2/*" />
+
+						<xsl:if test="not($theDocument_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to open referenced included file: </xsl:text>
+								<xsl:value-of select="@xlink:href" />
+							</xsl:message>
+						</xsl:if>
+
+						<xsl:if test="not($theFragment_2)">
+							<xsl:message terminate="no">
+								<xsl:text>Unable to locate id attribute: </xsl:text>
+								<xsl:value-of select="@xlink:href" />
+							</xsl:message>
+						</xsl:if>
+						<xsl:apply-templates select="$theFragment_2 "
+							mode="dsdl:go" />
+					</xsl:otherwise>
+				</xsl:choose>
+
+			</xsl:otherwise>
+		</xsl:choose>
+
+		<xsl:processing-instruction name="DSDL_INCLUDE_END">
+			<xsl:value-of select="@xlink:href" />
+		</xsl:processing-instruction>
+	</xslt:template>
+
+
+</xslt:stylesheet>
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..d59b8f38fe0bf28bc089b033c2acdd314839b8cc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl
@@ -0,0 +1,55 @@
+<?xml version="1.0" ?><?xar XSLT?>
+<!-- Implmentation for the Schematron XML Schema Language.
+	http://www.ascc.net/xml/resource/schematron/schematron.html
+ 
+ Copyright (c) 2000,2001 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
+
+ This software is provided 'as-is', without any express or implied warranty. 
+ In no event will the authors be held liable for any damages arising from 
+ the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose, 
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product, 
+ an acknowledgment in the product documentation would be appreciated but is 
+ not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be 
+ misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source distribution.
+-->
+
+<!-- Schematron message -->
+
+<xsl:stylesheet
+   version="1.0"
+   xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+   xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias">
+
+<xsl:import href="iso_schematron_skeleton_for_xslt1.xsl"/>
+
+<xsl:template name="process-prolog"> <!-- Named template that emits a single axsl:output element requesting the "text" output method; presumably replaces the imported skeleton's hook of the same name (verify against the skeleton) -->
+   <axsl:output method="text" />
+</xsl:template>
+
+<!-- use default rule for process-root:  copy contents / ignore title -->
+<!-- use default rule for process-pattern: ignore name and see -->
+<!-- use default rule for process-name:  output name -->
+<!-- use default rule for process-assert and process-report:
+     call process-message -->
+
+<xsl:template name="process-message"> <!-- Emits an axsl:message whose content is the current node's children processed in "text" mode, followed by the pattern (and, when given, the role) in parentheses -->
+   <xsl:param name="pattern" /> <!-- value printed first inside the parentheses -->
+   <xsl:param name="role" /> <!-- optional; when it tests true it is printed after " / " -->
+   <axsl:message>
+      <xsl:apply-templates mode="text"  
+      /> (<xsl:value-of select="$pattern" />
+      <xsl:if test="$role"> / <xsl:value-of select="$role" />
+      </xsl:if>)</axsl:message>
+</xsl:template>
+
+</xsl:stylesheet>
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..b0e7175cfff34fb05d631622c9429f0adcda8d5d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl
@@ -0,0 +1,1796 @@
+<?xml version="1.0"?><?xar XSLT?>
+
+<!-- 
+   OVERVIEW
+   
+   ASCC/Schematron.com Skeleton Module for ISO Schematron (for XSLT1 systems)
+   
+   ISO Schematron is a language for making assertions about the presence or absence
+   of patterns in XML documents. It is typically used as a schema language, or
+   to augment existing schema languages, and to check business rules. It is very
+   powerful, yet quite simple: a developer only need know XPath and about five other
+   elements.
+   
+   This is an open source implementation of ISO Schematron in XSLT. Although ISO does
+   not allow reference implementations which might compete with the text of the
+   standard, this code has been compiled by Rick Jelliffe, inventor of Schematron
+   and editor of the ISO standard; so developers can certainly use it as an 
+   unofficial reference implementation for clarification. 
+   
+   This implementation is based on one by Oliver Becker. API documentation is 
+   available separately; try www.schematron.com for this. Funding for this
+   stylesheet over the years has come from Topologi Pty. Ltd., Geotempo Ltd.,
+   and ASCC, Taipei.
+   
+   There are two versions of this skeleton: one is tailored for XSLT1 processors
+   and the other is tailored for XSLT2 processors. Future versions of the
+   XSLT2 skeleton may support more features than the XSLT 1 skeleton.
+-->
+<!--
+   TIPS
+      
+   A tip for new users of Schematron: make your assertions contain positive messages
+   about what is expected, rather than error messages. For example, use the form
+   "An X should have a Y, because Z". 
+   
+   Another tip is that Schematron provides an
+   element <sch:ns> for declaring the namespaces and prefixes used in Xpaths in 
+   attribute values; it does not extend the XML Namespaces mechanism: if a name
+   in an XPath has a prefix, there must be an <sch:ns> element for that prefix; if
+   a name in an XPath does not have a prefix, it is always in no namespace.
+   
+   A tip for implementers of Schematron, either using this API or re-implementing it:
+   make the value of the diagnostics, flags and richer features available if possible;
+   Schematron has many of the optional richer features which, if implemented, provide
+   a compelling alternative approach to validation and business-rules checking compared
+   to other schema languages and programs. 
+   
+   If you create your own meta-stylesheet to override this one, it is a
+   good idea to have both in the same directory and to run the stylesheet
+   from that directory, as many XSLT implementations have idiosyncratic
+   handling of URLs: keep it simple.
+-->
+ 
+
+<!--
+  INVOCATION INFORMATION
+  
+  The following parameters are available
+  
+    phase           NMTOKEN | "#ALL" (default) Select the phase for validation
+    allow-foreign   "true" | "false" (default)   Pass non-Schematron elements to the generated stylesheet
+    sch.exslt.imports semi-colon delimited string of filenames for some EXSLT implementations  
+    message-newline "true" (default) | "false"   Generate an extra newline at the end of messages
+    optimize        "visit-no-attributes"     
+    debug	    "true" | "false" (default)  Debug mode lets compilation continue despite problems
+    attributes "true" | "false"  (Autodetecting) Use only when the schema has no attributes as the context nodes
+    only-child-elements "true" | "false" (Autodetecting) Use only when the schema has no comments
+    or PI  as the context nodes
+    
+  The following parameters can be specified as Schematron variables in diagnostics, assertions and so on.
+    fileNameParameter string	  
+    fileDirParameter string				
+    archiveNameParameter string	  In case of ZIP files
+    archiveDirParameter string	  In case of ZIP files	
+    output-encoding				  Use when outputting to XML
+ 
+ Experimental: USE AT YOUR OWN RISK   
+    visit-text "true" | "false"   Also visit text nodes for context. WARNING: NON_STANDARD.
+    select-contexts '' | 'key' | '//'   Select different implementation strategies
+ 
+ Conventions: Meta-stylesheets that override this may use the following parameters
+    generate-paths=true|false   generate the @location attribute with XPaths
+    diagnose= yes | no    Add the diagnostics to the assertion test in reports
+    terminate= yes | no   Terminate on the first failed assertion or successful report
+-->
+
+<!-- 
+  XSLT VERSION SUPPORT
+
+  XSLT 1:
+     A schema using the standard XSLT 1 query binding will have a /schema/@queryBinding='xslt' or 
+     nothing.
+
+       * Note: XT does not implement key() and will die if given it. 
+       * Add all formal parameters to default templates
+       * Fix missing apply-templates from process-ns and add params back
+
+  EXSLT:  Experimental support
+     A schema using the EXSLT query binding will have a /schema/@queryBinding='exslt'.
+     It is built on XSLT 1. After experience is gained, this binding is expected to be 
+     formalized as part of ISO Schematron, which currently reserves the "exslt" name for this purpose.
+
+     Some EXSLT engines have the extra functions built-in. For these, there is no need to
+     provide library locations. For engines that require the functions, either hard code
+     them in this script or provide them on the command-line argument.
+ 
+-->
+<!--
+   PROCESS INFORMATION
+   
+   This stylesheet compiles a Schematron schema (*.sch) into XSLT code (*.xsl). 
+   The generated XSLT code can then be run against an XML file (*.xml, etc) and
+   will produce validation results.
+   
+   The output of validation results is performed using named templates (process-*). 
+   These can be overridden easily by making a new XSLT stylesheet that imports this 
+   stylesheet but has its own version of the relevant process-* templates. Several
+   of these invoking stylesheets are available: "iso_svrl.xsl", for example generates
+   ISO Schematron Validation Report Language format results.
+   
+   In this version of the stylesheet, the ISO feature called "abstract patterns" is
+   implemented using macro processing: a prior XSLT stage which converts uses
+   of abstract patterns into normal patterns. If you do not use abstract patterns,
+   it is not necessary to preprocess the schema.
+   
+   To summarize, a basic process flow for some commandline processor is like this:
+     XSLT -input=xxx.sch  -output=xxx.xsl  -stylesheet=iso_schematron_skeleton.xsl
+     XSLT -input=document.xml  -output=xxx-document.results  -stylesheet=xxx.xsl
+   
+   iso_svrl.xslt is an implementation of Schematron that can use this skeleton and
+   generate ISO SVRL reports. A process flow for some commandline processor would
+   be like this:
+     XSLT -input=xxx.sch  -output=xxx.xsl  -stylesheet=iso_svrl.xsl
+     XSLT -input=document.xml  -output=xxx-document.results  -stylesheet=xxx.xsl
+     
+   It is not impossible that ultimately a third stage, to handle macro-preprocessing
+   and inclusion, might be necessary. (The trade-off is in making this XSLT more
+   complex compared to making the outer process more complex.)
+             
+  This version has so far been tested with
+     Saxon 8
+     MSXML 4 (or 6?)   
+
+ Please note that if you are using SAXON and JAXP, then you should use 
+  System.setProperty("javax.xml.transform.TransformerFactory",
+                          "net.sf.saxon.TransformerFactoryImpl");
+ rather than 
+  System.setProperty("javax.xml.xpath.TransformerFactory",
+                           "net.sf.saxon.TransformerFactoryImpl");
+ which does not work, at least for the versions of SAXON we tried.
+-->
+<!--
+ LEGAL INFORMATION
+ 
+ Copyright (c) 2000-2008 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
+
+ This software is provided 'as-is', without any express or implied warranty. 
+ In no event will the authors be held liable for any damages arising from 
+ the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose, 
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product, 
+ an acknowledgment in the product documentation would be appreciated but is 
+ not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be 
+ misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source distribution.
+-->
+<!--
+  NOTE: Compared to the iso_schematron_skeleton_for_saxon.xsl code, this version is currently missing
+     1) localization
+     2) properties
+     3) pattern/@documents
+
+  VERSION INFORMATION 
+   2009-02-25 RJ
+        * Fix up variable names so none are used twice in same template
+        * Tested on SAXON 9, Xalan 2.7.1. Partly tested MSXML.  
+   2008-09-19 RJ
+        * Add mode schematron-select-full-path and param full-path-notation 
+   
+   2008-08-11
+   		* TT report/@flag was missing
+   2008-08-06
+   		* TT Top-level lets need to be implemented using xsl:param not xsl:variable
+   		* TT xsl:param/@select must have XPath or not be specified
+   		
+    Version: 2008-07-28
+   		* KH schematron-get-full-path-3 has [index] even on top step
+   		* RJ fix schematron-get-full-path to have namespace predicate, I don't know why this was removed
+   		
+   Version: 2008-07-24
+   		* RJ clean out commented out namespace handling code
+   		* RJ add support for experimental non-standard attribute report/@action
+   		and assert/@action, and add parameter not in the published API (should
+   		not break anything, it is XSLT1)
+   		* RJ Remove remaining XSLT2 code for ease of reading
+   		
+   Version: 2008-07-14 minor update for inclusion experiments
+   	* RJ Clean up zero-length fragment test on include
+   	* RJ Add experimental support for include containers 
+   	* RJ For path generation, test for //iso:schema not just /iso:schema, for potential embedded Schematron support   
+   	* RJ Don't generate double error messages for old namespace elements
+   	* RJ Experimental iso:rule/iso:title just kept as comment (bigger request Uche Ogbuji)
+   	* RJ Remove spurious debug messages
+   	* RJ Fix bug that prevented including patterns in this (report Roger
+   	Costello)
+  
+   Version: 2007-10-17
+     From this version on I am forking XSLT2 support to a different version of the script.
+     This is due to the increasingly horrible state of the namespace handling code as well
+     as other inconsistencies between the major implementations of different versions.
+     The intent is that future versions of this will have XSLT2 isms removed and be simplified
+     to cope with only XSLT1 and EXLST. Note that though this version is called
+     iso_schematron_skeleton_for_xslt1, the various meta-stylesheets will continue to just call
+     iso_schematron_skeleton: it is up to you to rename the stylesheet to the one you want to
+     use.
+
+       * RJ fix FULL-PATH problem with attribute names
+
+
+   Version: 2007-07-19
+     Accept most changes in David Carlisle's fork, but continue as XSLT1 script: 
+    	http://dpcarlisle.blogspot.com/search/label/schematron
+    	* DPC Remove "optimize" parameter
+    	* DPC Add autodetecting optimize parameter attribute to skip checking attribute
+    	context
+    	* DPC Add autodetecting optimize parameter only-child-elements turn off checking for 
+    	comments and PIs
+    	* DPC (Experimental: NON_STANDARD DANGER!) Add param visit-text to viist text
+    	nodes too for context 
+    	* DPC Fix inclusion syntax to allow #
+    	* DPC Priorities count up from 1000 not down from 4000 to allow more rules
+        * RJ Add new template for titles of schemas, with existing behaviour.  
+        Override process-schema-title for custom processing of title
+    		
+    
+   Version: 2007-04-04
+   	* RJ debug mode param
+	* RJ alter mixed test to only test mixed branches, so the same document
+	could have old and new namespaces schemas in it, but each schema must
+	be distinct, just so as not to overconstrain things.
+   	* KH zero-length include/@href is fatal error, but allow debug mode
+	* SB add hint on SAXON and JAXP
+	* DC generate-full-path-1 generates XLST1 code by default
+   Version: 2007-03-05
+      	* AS Typo for EXSLT randome, improve comment
+      	* KH get-schematron-full-path-2 needs to apply to attributes too
+      	* DP document policy on extensions better
+      	* DC use copy-of not copy for foreign elements
+      	* DC add generate-path-2
+      	* DC don't try to apply templates to attribute axis on attribute nodes, to
+      	stop SAXON warning.
+      	* RJ improve reporting of typos 
+   
+   Version: 2007-02-08
+   		* KH Schematron fullpath implementation: @* handled twice and / missing
+   		* KH Change stylesheetbody from named template to mode to allow implementers more flexibility.
+   		  Move process-ns to outside the stylesheet body.
+   		* DP, FG, fix handling of xslt:key
+   		* FG no iso:title/@class
+   		* Experimental optimization 'visit-no-attributes'
+   		* KH Experimental added schematron-get-full-path-2 which gives prefixed version for humans
+ 		* DC Move stylesheet/@version generation to after namespace handling
+ 		* DC, FG EXSLT namespace handling code
+ 		* FG add ref and commented code from FG's page on namespaces
+ 		* Start adding normalize-space() to parameter code
+ 		* Add a space between diagnostics
+   		   		 
+   Version: 2007-01-22
+   	* DP change = ($start) to = $start and =($phase) to =$phase 
+   	to run under Saxon 8.8j
+	* FG better title section using ( @id | sch:title)[last()]
+	* Default query language binding is "xslt" not "xslt1"
+  
+   Version: 2007-01-19
+   		* Simplify message newline code
+   		* Remove termination and xpath appending to message options: 
+   		   factor out as  iso_schematron_terminator.xsl
+   		* Comment out XSLT2 namespace fix temporarily
+  
+   Version: 2007-01-18 (First beta candidate for comment)
+          * DC remove xml:space="preserve"
+          * FG improve comment on import statement
+          * DC improve comments on invocation section
+          * Add exploratory support for sch:schema[@queryBinding='xpath']
+             by allowing it and warning as lets are found
+          * Be strict about queryBinding spelling errors
+          * Extra comments on the different queryBindings
+          * KH Add option "message-paths" to generate XPath from output 
+          * KH Add option "terminate" to halt with an error after the first assertion
+          * KH refactor paths in schematron-full-path
+          * Improve (?) namespace handling: no dummy attributes for prefix "xsl" generated
+   
+   Version: 2007-01-15
+          * FG fix for calling templates
+          * Add formal parameters to default templates: may help XSLT 2
+          * Fix get-schematron-full-path
+          * Include skeleton1-6 is commented out by default
+
+   Version:2007-01-12 (Pre-beta release to Schematron-love-in maillist)
+           * Add many extra parameters to the process-* calls, so that almost
+           all the information in the schema can be provided to client programs.
+           Also, rearrange the parameters to fit in with the ISO schema, which
+           has "rich" and "linkable" attribute groups.
+           * Warn on diagnostics with no ID once only
+           * Improved path reporting, to handle for namespaces
+           * Add process-title dummy template for API
+           * Add command-line parameter allow-foreign (true|false) to suppress
+            warnings on foreign elements and pass them through to the generated
+            stylesheet
+           * remove legacy templates for the old ASCC namespace and no namespace, 
+              and use an import statement instead. Much cleaner now!
+           * patterns use @id not @name
+           * titles can contain sub-elements
+           * start change sch:rule to allow attributes, PIs and comments 
+           * the default process-* for inline elements add a leading and trailing 
+             space, to reduce the chance of concatenation.
+           * add comments to make the generated code clearer
+           
+   Version:2006-11-07 (ISO: first release private to schematron-love-in maillist for review)
+           * Duplicate pattern templates, for handling ISO namespace
+           * Add priority onto default and paragraph templates
+           * Add namespace checks
+           * Handle key in xsl namespace not iso
+           * Add include
+           * Improve namespace handling
+           * Preliminary XSLT2 and EXSLT support
+	       * Refactor iso:schema for clarity
+
+    Version: 2003-05-26 
+    	    * Fix bug with key 
+    Version: 2003-04-16
+    	   * handle 1.6 let expressions
+    	   * make key use XSLT names, and allow anywhere
+    Version: 2001-06-13
+           * same skeleton now supports namespace or no namespace
+           * parameters to handlers updated for all 1.5 attributes 
+           * diagnostic hints supported: command-line option diagnose=yes|no
+           * phases supported: command-line option phase=#ALL|...
+           * abstract rules
+           * compile-time error messages  
+	   * add utility routine generate-id-from-path
+          
+    Contributors: Rick Jelliffe (original), Oliver Becker (architecture, XSLT2), 
+             Miloslav Nic (diagnostic, phase, options), Ludwig Svenonius (abstract)
+             Uche Ogbuji (misc. bug fixes), Jim Ancona (SAXON workaround),
+	 	     Francis Norton (generate-id-from-path), Robert Leftwich, Bryan Rasmussen,
+             Dave Pawson (include, fallback), Florent Georges (namespaces, exslt, attribute
+             context), Benoit Maisonny (attribute context), John Dumps (process-message newline),
+             Cliff Stanford (diagnostics and other newlines)
+
+    
+    KNOWN TYPICAL LIMITATIONS:
+      * Don't use <sch:ns prefix="xsl" .../> with a namespace other than the standard
+      XSLT one. This would be a bizarre thing to do anyway. 
+      * Don't use other prefixes for the XSLT namespace either; some implementations will
+      not handle it correctly.
+     
+     EXTENSIONS:
+      ISO Schematron is designed as a framework with some standard query language
+      bindings. If you need to support other features, please do so safely by making
+      up your own @queryBinding name: this makes it clear that your schema requires
+      special features. For example, default ISO Schematron does not support user
+      defined functions; so if you want to use the user defined function feature
+      in XSLT, you need to have a schema with some queryBinding attribute name like
+      "XSLT-with-my-functions" or whatever.
+-->
+
+
+
+
+<xsl:stylesheet version="1.0" 
+	xmlns:xsl="http://www.w3.org/1999/XSL/Transform" 
+	xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias" 
+	xmlns:sch="http://www.ascc.net/xml/schematron"
+    xmlns:iso="http://purl.oclc.org/dsdl/schematron" 
+    xmlns:exsl="http://exslt.org/common"
+    xmlns:msxsl="urn:schemas-microsoft-com:xslt"
+    extension-element-prefixes="exsl  msxsl"
+	 >
+<!-- This program implements ISO Schematron, except for abstract patterns which require a preprocess. -->
+  
+<!-- Literal result elements written with the axsl prefix are emitted in the XSLT namespace of the generated stylesheet -->
+<xsl:namespace-alias stylesheet-prefix="axsl" result-prefix="xsl"/>
+
+
+<!-- Category: top-level-element -->
+<xsl:output method="xml" omit-xml-declaration="no" standalone="yes"  indent="yes"/>
+
+
+<xsl:param name="phase">
+  <!-- Phase to compile. Unless the caller supplies a value, this is the schema's
+       defaultPhase attribute: an old-namespace (sch:) schema is consulted first,
+       then an ISO-namespace (iso:) schema. With no defaultPhase anywhere, the
+       value is the literal #ALL. -->
+  <xsl:choose>
+    <xsl:when test="not(//sch:schema/@defaultPhase | //iso:schema/@defaultPhase)">#ALL</xsl:when>
+    <xsl:when test="//sch:schema/@defaultPhase"><xsl:value-of select="//sch:schema/@defaultPhase"/></xsl:when>
+    <xsl:otherwise><xsl:value-of select="//iso:schema/@defaultPhase"/></xsl:otherwise>
+  </xsl:choose>
+</xsl:param>
+<!-- "true" passes non-Schematron (foreign) elements through to the generated stylesheet; default "false" -->
+<xsl:param name="allow-foreign">false</xsl:param>
+<!-- "true" (default) generates an extra newline at the end of messages -->
+<xsl:param name="message-newline">true</xsl:param>
+
+<!-- DPC set to true if contexts should be checked on attribute nodes.
+         Defaults to true if there is any possibility that a context could match an attribute
+         (the context of some iso:rule contains '@' or 'attribute'). This errs on the side of caution:
+         a context of *[.='@'] would cause this param to default to true even though @ is in a string.
+-->
+<xsl:param name="attributes">
+  <xsl:choose>
+    <xsl:when test="//iso:rule[contains(@context,'@') or contains(@context,'attribute')]">true</xsl:when>
+    <xsl:otherwise>false</xsl:otherwise>
+  </xsl:choose>
+</xsl:param>
+
+<!-- DPC set to true if contexts need only be checked on elements in the child axis (comments and PIs are then skipped).
+         Defaults to false if there is any possibility that a context could match a comment or PI: node(), comment() and
+         processing-instruction() all contain a '('. Erring on the side of caution, a context of *[.='('] also yields false.
+         Schemas with old-namespace rules or unresolved includes cannot be analysed here, so they default to false as well.
+         NOTE: altered from the upstream skeleton, whose autodetected default was inverted (true exactly when '(' was found). -->
+<xsl:param name="only-child-elements">
+  <xsl:choose>
+    <xsl:when test="//iso:rule[contains(@context,'(')] or //sch:rule or //iso:include or //iso:extends[@href]">false</xsl:when>
+    <xsl:otherwise>true</xsl:otherwise>
+  </xsl:choose>
+</xsl:param>
+
+<!-- DPC set to true if contexts should also be checked on text nodes (only consulted when only-child-elements is not 'true')
+         THIS IS NON CONFORMANT BEHAVIOUR JUST FOR DISCUSSION OF A POSSIBLE CHANGE TO THE
+         SPECIFICATION. THIS PARAM SHOULD GO IF THE FINAL DECISION IS THAT THE SPEC DOES NOT CHANGE.
+	 Always defaults to false
+-->
+<xsl:param name="visit-text" select="'false'"/>
+
+<!-- DPC
+  When selecting contexts the specified behaviour is
+    @*|node()[not(self::text())]
+  The automatic settings may instead use
+      node()[not(self::text())]
+      @*|*
+      *
+  for schemas for which they are equivalent.
+  If the params are set explicitly, the above may be used, and also either of
+      @*
+      @*|node()
+  In all cases the result may not be equivalent: for example, if you specify no attributes and the schema
+  does have attribute contexts, they will be silently ignored.
+  The value is built as "@*|" when $attributes is 'true', followed by "*", "node()" or the default shown below.
+  After testing, it turns out that
+  node()[not(self::text())] is slower in saxon than *|comment()|processing-instruction(),
+  which I find a bit surprising, but anyway I'll use the longer, faster version.
+-->
+<xsl:variable name="context-xpath">
+  <xsl:if test="$attributes='true'">@*|</xsl:if>
+  <xsl:choose>
+    <xsl:when test="$only-child-elements='true'">*</xsl:when>
+    <xsl:when test="$visit-text='true'">node()</xsl:when>
+    <xsl:otherwise>*|comment()|processing-instruction()</xsl:otherwise>
+  </xsl:choose>
+</xsl:variable>
+
+<!-- DPC if this is set to
+    '' use recursive templates to iterate over the document tree,
+    'key' select all contexts with a key rather than walking the tree explicitly in each mode
+    '//' select all contexts with // rather than walking the tree explicitly in each mode (XSLT2 only)
+-->
+<xsl:param name="select-contexts" select="''"/>
+
+<!-- Output encoding: use when outputting to XML -->
+<xsl:param name="output-encoding"/>
+<!-- e.g. saxon file.xml file.xsl "sch.exslt.imports=.../string.xsl;.../math.xsl" -->
+<xsl:param name="sch.exslt.imports"/>
+
+<!-- Set the language code for messages -->
+<xsl:param name="langCode">default</xsl:param>
+<!-- Debug mode ("true") lets compilation continue despite problems; default "false" -->
+<xsl:param name="debug">false</xsl:param>
+
+
+<!-- Set the default for schematron-select-full-path, i.e. the notation for svrl's @location-->
+<xsl:param name="full-path-notation">1</xsl:param>
+
+<!-- Simple namespace check: report old-namespace (sch:) elements nested with ISO-namespace (iso:) elements, then process the document -->
+<xsl:template match="/">
+    <xsl:if  test="//sch:*[ancestor::iso:* or descendant::iso:*]">
+	<xsl:message>Schema error: Schematron elements in old and new namespaces found</xsl:message>
+	<xsl:if test=" $debug = 'false' " />
+    </xsl:if>
+    <!-- NOTE(review): the empty xsl:if on $debug above has no effect, so processing always continues here; confirm whether a terminating message was intended -->
+    <xsl:apply-templates />
+</xsl:template>
+
+
+<!-- ============================================================== -->
+<!-- ISO SCHEMATRON SCHEMA ELEMENT  -->
+<!-- Not handled: Abstract patterns. A pre-processor is assumed. -->
+<!-- ============================================================== -->
+
+<!-- SCHEMA -->
+<!-- Default XSLT 1 binding: no queryBinding, 'xslt', 'xpath', or the misspellings 'xslt1'/'XSLT'/'XSLT1' (accepted, but a message is emitted) -->
+<xsl:template match="iso:schema[not(@queryBinding) or @queryBinding='xslt' 
+     or @queryBinding='xslt1' or @queryBinding='XSLT' or @queryBinding='XSLT1'
+     or @queryBinding='xpath']">
+     <xsl:if test="
+	     @queryBinding='xslt1' or @queryBinding='XSLT' or @queryBinding='XSLT1'">
+	     <xsl:message>Schema error: in the queryBinding attribute, use 'xslt'</xsl:message>
+	</xsl:if>
+	<axsl:stylesheet>
+	    <xsl:apply-templates select="iso:ns"/>
+	    <!-- Handle the namespaces before the version attribute: reported to help SAXON -->
+	    <xsl:attribute name="version">1.0</xsl:attribute>
+	    
+		<xsl:apply-templates select="." mode="stylesheetbody"/>
+		<!-- The body is produced by the stylesheetbody mode (formerly xsl:call-template name="stylesheetbody") -->
+	</axsl:stylesheet>
+</xsl:template>
+
+<!-- Using EXSLT with all modules (except function module: not applicable); priority 10 and an explicit 'exslt' queryBinding select this template -->
+<xsl:template match="iso:schema[@queryBinding='exslt']" priority="10">
+    <xsl:comment>This XSLT was automatically generated from a Schematron schema.</xsl:comment>
+	<axsl:stylesheet
+ 	  	xmlns:date="http://exslt.org/dates-and-times"
+ 	  	xmlns:dyn="http://exslt.org/dynamic"
+		xmlns:exsl="http://exslt.org/common"
+		xmlns:math="http://exslt.org/math"
+   		xmlns:random="http://exslt.org/random"
+  		xmlns:regexp="http://exslt.org/regular-expressions"
+   		xmlns:set="http://exslt.org/sets"
+   		xmlns:str="http://exslt.org/strings"
+   		extension-element-prefixes="date dyn exsl math random regexp set str" >
+	
+        <xsl:apply-templates select="iso:ns"/>
+	    <!-- Handle the namespaces before the version attribute: reported to help SAXON -->
+	    <xsl:attribute name="version">1.0</xsl:attribute>
+	    
+	    <xsl:apply-templates select="." mode="stylesheetbody"/>
+		<!-- The body is produced by the stylesheetbody mode (formerly xsl:call-template name="stylesheetbody") -->
+	</axsl:stylesheet>
+</xsl:template>
+
+
+<!-- Fallback (priority -1): any other queryBinding value is unsupported, so compilation terminates with an error -->
+<xsl:template match="iso:schema" priority="-1">
+	<xsl:message terminate="yes" >Fail: This implementation of ISO Schematron does not work with 
+	schemas using the "<xsl:value-of select="@queryBinding"/>" query language.</xsl:message>        
+</xsl:template>
+
+<xsl:template match="*" mode="stylesheetbody">
+	<!-- Emits the body of the generated stylesheet (formerly the named template "stylesheetbody"), one labelled section after another -->
+    <xsl:comment>Implementers: please note that overriding process-prolog or process-root is 
+    the preferred method for meta-stylesheets to use where possible. </xsl:comment><xsl:text>&#10;</xsl:text>
+
+    <!-- NOTE: altered from upstream. EXSLT imports are emitted first, because XSLT 1.0 requires xsl:import to precede all other element children of xsl:stylesheet -->
+    <xsl:call-template name="iso:exslt.add.imports" />
+
+   <!-- These parameters may contain strings with the name and directory of the file being
+   validated. For convenience, if the caller only has the information in a single string,
+   that string could be put in fileDirParameter. The archives parameters are available
+   for ZIP archives.
+	-->
+	<axsl:param name="archiveDirParameter" />
+	<axsl:param name="archiveNameParameter" />
+	<axsl:param name="fileNameParameter" />
+	<axsl:param name="fileDirParameter" />
+    <xsl:text>&#10;&#10;</xsl:text><xsl:comment>PHASES</xsl:comment><xsl:text>&#10;</xsl:text>
+	<xsl:call-template name="handle-phase"/>
+    <xsl:text>&#10;&#10;</xsl:text><xsl:comment>PROLOG</xsl:comment><xsl:text>&#10;</xsl:text>
+	<xsl:call-template name="process-prolog"/>
+    <xsl:text>&#10;&#10;</xsl:text><xsl:comment>KEYS</xsl:comment><xsl:text>&#10;</xsl:text>
+	<xsl:apply-templates mode="do-keys"   select="xsl:key  "/>
+    <xsl:text>&#10;&#10;</xsl:text><xsl:comment>DEFAULT RULES</xsl:comment><xsl:text>&#10;</xsl:text>
+    <xsl:call-template name="generate-default-rules" />
+    <xsl:text>&#10;&#10;</xsl:text><xsl:comment>SCHEMA METADATA</xsl:comment><xsl:text>&#10;</xsl:text>
+    <xsl:call-template name="handle-root"/>
+    <xsl:text>&#10;&#10;</xsl:text><xsl:comment>SCHEMATRON PATTERNS</xsl:comment><xsl:text>&#10;</xsl:text>
+	<!-- Finally process every child element except iso:ns, which the calling schema template has already handled -->
+	<xsl:apply-templates select="*[not(self::iso:ns)] " />
+</xsl:template>
+ 
+    <xsl:template name="iso:exslt.add.imports">
+      <xsl:param name="imports" select="$sch.exslt.imports"/>
+      <!-- Writes one xsl:import per entry of the semicolon-delimited $imports list, recursing on the text after the first ';' -->
+      <xsl:variable name="has-separator" select="contains($imports, ';')"/>
+      <xsl:if test="$has-separator">
+        <axsl:import href="{ substring-before($imports, ';') }"/>
+        <xsl:call-template name="iso:exslt.add.imports">
+          <xsl:with-param name="imports"  select="substring-after($imports, ';')"/>
+        </xsl:call-template>
+      </xsl:if>
+      <xsl:if test="not($has-separator) and $imports">
+        <axsl:import href="{ $imports }"/>
+      </xsl:if>
+    </xsl:template>
+
+<xsl:template name="handle-phase" >
+	<!-- Emits a (non-terminating) message when a phase other than "#ALL" is requested but no iso:phase child of the current node has that id -->
+	<xsl:variable name="wanted-phase" select="normalize-space( $phase )" />
+	<xsl:if test="$wanted-phase != '#ALL' and not(iso:phase[@id = $wanted-phase])">
+		<xsl:message>Phase Error: no phase with name <xsl:value-of select="$wanted-phase"
+		/> has been defined.</xsl:message>
+	</xsl:if>
+</xsl:template>
+
+<xsl:template name="generate-default-rules">
+		<xsl:text>&#10;&#10;</xsl:text>
+		<xsl:comment>MODE: SCHEMATRON-SELECT-FULL-PATH</xsl:comment><xsl:text>&#10;</xsl:text>
+		<xsl:comment>This mode can be used to generate an ugly though full XPath for locators</xsl:comment><xsl:text>&#10;</xsl:text>
+   		<axsl:template match="*" mode="schematron-select-full-path">
+   			<xsl:choose>
+   				<xsl:when test=" $full-path-notation = '1' ">
+   					<!-- Use for computers, but rather unreadable for humans -->
+					<axsl:apply-templates select="." mode="schematron-get-full-path"/>
+				</xsl:when>
+   				<xsl:when test=" $full-path-notation = '2' ">
+   					<!-- Use for humans, but no good for paths unless namespaces are known out-of-band -->
+					<axsl:apply-templates select="." mode="schematron-get-full-path-2"/>
+				</xsl:when>
+   				<xsl:when test=" $full-path-notation = '3' "> 
+   					<!-- Obsolescent. Use for humans, but no good for paths unless namespaces are known out-of-band -->
+					<axsl:apply-templates select="." mode="schematron-get-full-path-3"/>
+				</xsl:when>
+
+                   <xsl:otherwise >
+                       <!-- Use for computers, but rather unreadable for humans -->
+                    <axsl:apply-templates select="." mode="schematron-get-full-path"/>
+                </xsl:otherwise>
+			</xsl:choose>
+		</axsl:template>
+	
+
+		<xsl:text>&#10;&#10;</xsl:text>
+		<xsl:comment>MODE: SCHEMATRON-FULL-PATH</xsl:comment><xsl:text>&#10;</xsl:text>
+		<xsl:comment>This mode can be used to generate an ugly though full XPath for locators</xsl:comment><xsl:text>&#10;</xsl:text>
+   		<axsl:template match="*" mode="schematron-get-full-path">
+			<axsl:apply-templates select="parent::*" mode="schematron-get-full-path"/>
+			
+			<!-- XSLT1 syntax -->
+
+			<axsl:text>/</axsl:text>
+			<axsl:choose>
+			<axsl:when test="namespace-uri()=''">
+			<axsl:value-of select="name()"/>
+			<axsl:variable name="p_1" select="1+
+			count(preceding-sibling::*[name()=name(current())])" />
+		<axsl:if test="$p_1&gt;1 or following-sibling::*[name()=name(current())]">
+		  <xsl:text/>[<axsl:value-of select="$p_1"/>]<xsl:text/>
+		</axsl:if>
+		</axsl:when>
+		<axsl:otherwise>
+		<axsl:text>*[local-name()='</axsl:text>
+		<axsl:value-of select="local-name()"/><axsl:text>' and namespace-uri()='</axsl:text>
+		<axsl:value-of select="namespace-uri()"/>
+		<axsl:text>']</axsl:text>
+		<axsl:variable name="p_2" select="1+
+		count(preceding-sibling::*[local-name()=local-name(current())])" />
+		<axsl:if test="$p_2&gt;1 or following-sibling::*[local-name()=local-name(current())]">
+		  <xsl:text/>[<axsl:value-of select="$p_2"/>]<xsl:text/>
+		</axsl:if>
+		</axsl:otherwise>
+		</axsl:choose> 
+       	 	</axsl:template>
+       	 	
+       	 	
+		<!-- Generated template: writes the location of the context attribute.
+		     The owning element's path is written first (same mode), so the result is
+		     a full path like the one produced for elements, followed by "/@name" or,
+		     for a namespaced attribute, "/@*[local-name()='..' and namespace-uri()='..']". -->
+		<axsl:template match="@*" mode="schematron-get-full-path">
+			<axsl:apply-templates select="parent::*" mode="schematron-get-full-path"/>
+
+			<!-- XSLT1 syntax -->
+			<axsl:text>/</axsl:text>
+			<axsl:choose>
+				<axsl:when test="namespace-uri()=''">@<axsl:value-of
+				select="name()"/></axsl:when>
+				<axsl:otherwise>
+					<axsl:text>@*[local-name()='</axsl:text>
+					<axsl:value-of select="local-name()"/>
+					<axsl:text>' and namespace-uri()='</axsl:text>
+					<axsl:value-of select="namespace-uri()"/>
+					<axsl:text>']</axsl:text>
+				</axsl:otherwise>
+			</axsl:choose>
+
+		</axsl:template>
+	
+	
+	<xsl:text>&#10;&#10;</xsl:text>
+	
+	<xsl:comment>MODE: SCHEMATRON-FULL-PATH-2</xsl:comment>
+	<xsl:text>&#10;</xsl:text>
+	<xsl:comment>This mode can be used to generate prefixed XPath for humans</xsl:comment>
+	<xsl:text>&#10;</xsl:text>
+	<!--simplify the error messages by using the namespace prefixes of the
+     instance rather than the generic namespace-uri-styled qualification-->
+	<!-- Generated template: writes a path made of the qualified names (as they
+	     appear in the instance) of every ancestor-or-self element.
+	     A 1-based index is added only when the element has a preceding sibling
+	     with the same name, so the first of several same-named siblings is
+	     written without an index.
+	     When the context node is not an element, "/@" plus name(.) is appended. -->
+	<axsl:template match="node() | @*" mode="schematron-get-full-path-2">
+	<!--report the element hierarchy-->
+		<axsl:for-each select="ancestor-or-self::*">
+			<axsl:text>/</axsl:text>
+			<axsl:value-of select="name(.)"/>
+			<!-- inside the predicate "." is the sibling; current() is the element being reported -->
+			<axsl:if test="preceding-sibling::*[name(.)=name(current())]">
+				<axsl:text>[</axsl:text>
+				<axsl:value-of
+					select="count(preceding-sibling::*[name(.)=name(current())])+1"/>
+				<axsl:text>]</axsl:text>
+			</axsl:if>
+		</axsl:for-each>
+		<!--report the attribute-->
+		<!-- NOTE(review): the match pattern also admits text, comment and PI nodes, which take this branch too -->
+		<axsl:if test="not(self::*)">
+			<axsl:text/>/@<axsl:value-of select="name(.)"/>
+		</axsl:if>
+	</axsl:template>
+
+		<xsl:text>&#10;&#10;</xsl:text>
+		<xsl:comment>MODE: GENERATE-ID-FROM-PATH </xsl:comment><xsl:text>&#10;</xsl:text>
+		<!-- repeatable-id maker derived from Francis Norton's. -->
+		<!-- use this if you need generate ids in separate passes,
+		     because generate-id() is not guaranteed to produce the same
+		     results each time. These ids are not XML names but closer to paths. -->
+		<!-- Generated templates for mode generate-id-from-path. Every template except
+		     the one for the root node first recurses to the parent element and then
+		     appends its own step, so the id is assembled root-first. -->
+		<!-- Root node: writes nothing, which ends the recursion -->
+		<axsl:template match="/" mode="generate-id-from-path"/>
+		<!-- Text node: ".text-N-" where N is 1 + the number of preceding sibling text nodes -->
+		<axsl:template match="text()" mode="generate-id-from-path">
+			<axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+			<axsl:value-of select="concat('.text-', 1+count(preceding-sibling::text()), '-')"/>
+		</axsl:template>
+		<!-- Comment node: ".comment-N-", counted among sibling comments -->
+		<axsl:template match="comment()" mode="generate-id-from-path">
+			<axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+			<axsl:value-of select="concat('.comment-', 1+count(preceding-sibling::comment()), '-')"/>
+		</axsl:template>
+		<!-- Processing instruction: ".processing-instruction-N-", counted among sibling PIs -->
+		<axsl:template match="processing-instruction()" mode="generate-id-from-path">
+			<axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+			<axsl:value-of 
+			select="concat('.processing-instruction-', 1+count(preceding-sibling::processing-instruction()), '-')"/>
+		</axsl:template>
+		<!-- Attribute: ".@" followed by the attribute's qualified name -->
+		<axsl:template match="@*" mode="generate-id-from-path">
+			<axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+			<axsl:value-of select="concat('.@', name())"/>
+		</axsl:template>
+		<!-- Element: name plus 1-based position among same-named siblings.
+		     NOTE(review): a literal "." is written and the concat() below starts with
+		     another ".", so each element step begins with two dots; confirm whether
+		     consumers depend on that before changing it. -->
+		<axsl:template match="*" mode="generate-id-from-path" priority="-0.5">
+			<axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+			<axsl:text>.</axsl:text>
+<!--
+			<axsl:choose>
+				<axsl:when test="count(. | ../namespace::*) = count(../namespace::*)">
+					<axsl:value-of select="concat('.namespace::-',1+count(namespace::*),'-')"/>
+				</axsl:when>
+				<axsl:otherwise>
+-->
+				<axsl:value-of 
+				select="concat('.',name(),'-',1+count(preceding-sibling::*[name()=name(current())]),'-')"/>
+<!--
+				</axsl:otherwise>
+			</axsl:choose>
+-->
+		</axsl:template>
+		
+		
+	<xsl:comment>MODE: SCHEMATRON-FULL-PATH-3</xsl:comment>
+	
+	<xsl:text>&#10;</xsl:text>
+	<xsl:comment>This mode can be used to generate prefixed XPath for humans 
+	(Top-level element has index)</xsl:comment>
+	<xsl:text>&#10;</xsl:text>
+	<!--simplify the error messages by using the namespace prefixes of the
+     instance rather than the generic namespace-uri-styled qualification-->
+	<!-- Generated template: like mode schematron-get-full-path-2, it writes the
+	     qualified names of every ancestor-or-self element, but here a 1-based
+	     index is written for every element that has a parent element, whether or
+	     not same-named siblings exist. An element without a parent element gets
+	     no index.
+	     When the context node is not an element, "/@" plus name(.) is appended. -->
+	<axsl:template match="node() | @*" mode="schematron-get-full-path-3">
+	<!--report the element hierarchy-->
+		<axsl:for-each select="ancestor-or-self::*">
+			<axsl:text>/</axsl:text>
+			<axsl:value-of select="name(.)"/>
+			<axsl:if test="parent::*">
+				<axsl:text>[</axsl:text>
+				<axsl:value-of
+					select="count(preceding-sibling::*[name(.)=name(current())])+1"/>
+				<axsl:text>]</axsl:text>
+			</axsl:if>
+		</axsl:for-each>
+		<!--report the attribute-->
+		<axsl:if test="not(self::*)">
+			<axsl:text/>/@<axsl:value-of select="name(.)"/>
+		</axsl:if>
+	</axsl:template>
+
+		<xsl:text>&#10;&#10;</xsl:text>
+		<xsl:comment>MODE: GENERATE-ID-2 </xsl:comment><xsl:text>&#10;</xsl:text>
+		<!-- repeatable-id maker from David Carlisle. -->
+		<!-- use this if you need generate IDs in separate passes,
+		     because generate-id() is not guaranteed to produce the same
+		     results each time. These IDs are well-formed XML NMTOKENS -->
+	<!-- Generated templates for mode generate-id-2. All ids start with "U". -->
+	<!-- Root node: the id is just "U" -->
+	<axsl:template match="/" mode="generate-id-2">U</axsl:template>
+
+	<!-- Element: "U" followed by the multi-level element number from axsl:number.
+	     Priority 2 makes this win over the node() template below. -->
+	<axsl:template match="*" mode="generate-id-2" priority="2">
+		<axsl:text>U</axsl:text>
+		<axsl:number level="multiple" count="*"/>
+	</axsl:template>
+
+	<!-- Other child nodes: "U." + multi-level element number + "n" + the node's own number -->
+	<axsl:template match="node()" mode="generate-id-2">
+		<axsl:text>U.</axsl:text>
+		<axsl:number level="multiple" count="*"/>
+		<axsl:text>n</axsl:text>
+		<axsl:number count="node()"/>
+	</axsl:template>
+
+	<!-- Attribute: "U." + multi-level element number + "_" + length of the local name
+	     + "_" + the qualified name with ":" replaced by "." -->
+	<axsl:template match="@*" mode="generate-id-2">
+		<axsl:text>U.</axsl:text>
+		<axsl:number level="multiple" count="*"/>
+		<axsl:text>_</axsl:text>
+		<axsl:value-of select="string-length(local-name(.))"/>
+		<axsl:text>_</axsl:text>
+		<axsl:value-of select="translate(name(),':','.')"/>
+	</axsl:template> 
+
+
+		<xsl:comment>Strip characters</xsl:comment>
+		<!-- Generated default-mode template with an empty body: text nodes that reach it produce no output -->
+		<axsl:template match="text()" priority="-1" />
+			
+  </xsl:template>
+
+ <!-- Named template: emits the generated template for the document root ("/").
+      Its body is whatever the process-root template produces from the parameters
+      below, all of which are read from the context node of the caller.
+      The "contents" parameter is built by applying templates in mode
+      do-all-patterns to the context node's children. -->
+ <xsl:template name="handle-root">
+		<!-- Process the top-level element -->
+		<axsl:template match="/">
+			<xsl:call-template name="process-root">
+				<!-- last node in document order of @id | iso:title -->
+				<xsl:with-param 	
+				name="title" select="(@id | iso:title)[last()]"/>
+				<xsl:with-param name="version" select="'iso'" />
+				<xsl:with-param name="schemaVersion" select="@schemaVersion" />
+				<xsl:with-param name="queryBinding" select="@queryBinding" />
+				<xsl:with-param name="contents">
+					<xsl:apply-templates mode="do-all-patterns"/>
+				</xsl:with-param>
+				
+				<!-- "Rich" properties -->
+				<xsl:with-param name="fpi" select="@fpi"/>
+				<xsl:with-param name="icon" select="@icon"/>
+				<xsl:with-param name="id" select="@id"/>
+				<xsl:with-param name="lang" select="@xml:lang"/>
+				<xsl:with-param name="see" select="@see" />
+				<xsl:with-param name="space" select="@xml:space" />
+				
+				
+				<!-- Non-standard extensions not part of the API yet -->
+				<xsl:with-param name="action" select="@action" />
+			</xsl:call-template>
+		</axsl:template>
+ 
+      
+</xsl:template>
+
+<!-- ============================================================== -->
+<!-- ISO SCHEMATRON ELEMENTS -->
+<!-- ============================================================== -->
+
+	<!-- ISO ACTIVE -->
+	<!-- Checks an <active> element and reports problems through xsl:message only:
+	     (1) the pattern attribute is missing;
+	     (2) no pattern with that id is a child of the grandparent node, unless the
+	         grandparent still has an <include> child, in which case nothing is reported. -->
+	<xsl:template match="iso:active">
+		<xsl:variable name="owner" select="../.."/>
+		<xsl:variable name="wanted" select="@pattern"/>
+
+		<xsl:if test="not($wanted)">
+			<xsl:message>Markup Error: no pattern attribute in &lt;active></xsl:message>
+		</xsl:if>
+
+		<xsl:if test="not($owner/iso:pattern[@id = $wanted] or $owner/iso:include)">
+			<xsl:message>Reference Error: the pattern  "<xsl:value-of select="$wanted"/>" has been activated but is not declared</xsl:message>
+		</xsl:if>
+	</xsl:template>
+
+	<!-- ISO ASSERT and REPORT -->
+	<!-- Compiles an <assert> into a generated choose: when the test expression is
+	     true nothing happens; otherwise the output of the process-assert template
+	     is emitted. A missing test attribute is reported through xsl:message.
+	     An "ASSERT role" comment is written into the generated stylesheet first. -->
+	<xsl:template match="iso:assert">
+  
+                <xsl:if test="not(@test)">
+                    <xsl:message>Markup Error: no test attribute in &lt;assert></xsl:message>
+                </xsl:if>
+        <xsl:text>&#10;&#10;		</xsl:text>
+		<xsl:comment>ASSERT <xsl:value-of select="@role" /> </xsl:comment><xsl:text>&#10;</xsl:text>      
+	
+		<axsl:choose>
+			<!-- the schema's test expression is copied verbatim into the generated test -->
+			<axsl:when test="{@test}"/>
+			<axsl:otherwise>
+				<xsl:call-template name="process-assert">
+					<xsl:with-param name="test" select="normalize-space(@test)" />
+					<xsl:with-param name="diagnostics" select="@diagnostics"/>
+					<xsl:with-param name="flag" select="@flag"/>
+					
+					<!-- "Rich" properties -->
+					<xsl:with-param name="fpi" select="@fpi"/>
+					<xsl:with-param name="icon" select="@icon"/>
+					<xsl:with-param name="id" select="@id"/>
+					<xsl:with-param name="lang" select="@xml:lang"/>
+					<xsl:with-param name="see" select="@see" />
+					<xsl:with-param name="space" select="@xml:space" />
+					
+					<!-- "Linking" properties -->
+					<xsl:with-param name="role" select="@role" />
+					<xsl:with-param name="subject" select="@subject" />
+				</xsl:call-template>
+ 			
+			</axsl:otherwise>
+		</axsl:choose>
+	</xsl:template>
+	<!-- Compiles a <report> into a generated axsl:if: when the test expression is
+	     true, the output of the process-report template is emitted. A missing
+	     test attribute is reported through xsl:message. A "REPORT role" comment is
+	     written into the generated stylesheet first. -->
+	<xsl:template match="iso:report">
+		 
+                <xsl:if test="not(@test)">
+                    <xsl:message>Markup Error: no test attribute in &lt;report></xsl:message>
+                </xsl:if>
+                
+        <xsl:text>&#10;&#10;		</xsl:text>
+		<xsl:comment>REPORT <xsl:value-of select="@role" /> </xsl:comment><xsl:text>&#10;</xsl:text>      
+	
+		<!-- the schema's test expression is copied verbatim into the generated test -->
+		<axsl:if test="{@test}">
+		
+			<xsl:call-template name="process-report">
+				<xsl:with-param name="test" select="normalize-space(@test)" />
+				<xsl:with-param name="diagnostics" select="@diagnostics"/>
+					<xsl:with-param name="flag" select="@flag"/>
+					
+					<!-- "Rich" properties -->
+					<xsl:with-param name="fpi" select="@fpi"/>
+					<xsl:with-param name="icon" select="@icon"/>
+					<xsl:with-param name="id" select="@id"/>
+					<xsl:with-param name="lang" select="@xml:lang"/>
+					<xsl:with-param name="see" select="@see" />
+					<xsl:with-param name="space" select="@xml:space" />
+					
+					<!-- "Linking" properties -->
+					<xsl:with-param name="role" select="@role" />
+					<xsl:with-param name="subject" select="@subject" />
+			</xsl:call-template>
+				
+		</axsl:if>
+	</xsl:template>
+
+
+	<!-- ISO DIAGNOSTIC -->
+	<!-- We use a mode here to maintain backwards compatibility, instead of adding it
+	     to the other mode.
+	-->
+	<!-- Validation pass only: reports a <diagnostic> that has no id attribute. Produces no output. -->
+	<xsl:template match="iso:diagnostic" mode="check-diagnostics">
+              <xsl:if test="not(@id)">
+                    <xsl:message>Markup Error: no id attribute in &lt;diagnostic></xsl:message>
+               </xsl:if>
+    </xsl:template>
+    
+    <!-- Default mode: hands the diagnostic's "rich" attributes to the process-diagnostic template -->
+    <xsl:template match="iso:diagnostic"  >
+                <xsl:call-template name="process-diagnostic">
+                
+					<!-- "Rich" properties -->
+					<xsl:with-param name="fpi" select="@fpi"/>
+					<xsl:with-param name="icon" select="@icon"/>
+					<xsl:with-param name="id" select="@id"/>
+					<xsl:with-param name="lang" select="@xml:lang"/>
+					<xsl:with-param name="see" select="@see" />
+					<xsl:with-param name="space" select="@xml:space" />
+               </xsl:call-template>
+        </xsl:template>
+
+	<!-- ISO DIAGNOSTICS -->
+	<!-- Runs the check-diagnostics validation over every child element; the children are not processed in the default mode here -->
+	<xsl:template match="iso:diagnostics" >
+		<xsl:apply-templates mode="check-diagnostics" select="*" />
+	</xsl:template>
+
+	<!-- ISO DIR -->
+	<!-- Text mode: forwards the value attribute of <dir> to the process-dir template -->
+	<xsl:template match="iso:dir"  mode="text" >
+		<xsl:call-template name="process-dir">
+			<xsl:with-param name="value" select="@value"/>
+		</xsl:call-template>
+	</xsl:template>
+
+	<!-- ISO EMPH -->
+	<!-- Text mode: delegates <emph> to the process-emph template, passing no parameters -->
+	<xsl:template match="iso:emph"  mode="text">
+	 
+		<xsl:call-template name="process-emph"/> 
+
+	</xsl:template>
+
+	<!-- ISO EXTENDS -->
+	<!-- Expands <extends rule="ID">: every rule in the document whose id equals the
+	     rule attribute is processed in mode "extends" at this point.
+	     Problems are reported through xsl:message: a missing rule attribute, no
+	     abstract rule with that id, and (via IamEmpty) unexpected child elements. -->
+	<xsl:template match="iso:extends">
+		<!-- every rule carrying the referenced id, abstract or not -->
+		<xsl:variable name="referenced" select="//iso:rule[@id = current()/@rule]"/>
+
+		<xsl:if test="not(@rule)">
+			<xsl:message>Markup Error: no rule attribute in &lt;extends></xsl:message>
+		</xsl:if>
+		<xsl:if test="not($referenced[@abstract='true'])">
+			<xsl:message>Reference Error: the abstract rule  "<xsl:value-of select="@rule"/>" has been referenced but is not declared</xsl:message>
+		</xsl:if>
+		<xsl:call-template name="IamEmpty" />
+
+		<!-- applying templates to an empty node-set does nothing, so no guard is needed -->
+		<xsl:apply-templates select="$referenced" mode="extends"/>
+
+	</xsl:template>
+
+	<!-- KEY: ISO has no KEY -->
+	<!-- NOTE: 
+	     Key has had a checkered history. Schematron 1.0 allowed it in certain places, but
+	     users came up with a different location, which has now been adopted. 
+	     
+	     XT, the early XSLT processor, did not implement key and died when it was present. 
+	     So there are some versions of the Schematron skeleton for XT that strip out all
+	     key elements.
+	     
+	     Xalan (e.g. Xalan4C 1.0 and a Xalan4J) also had a funny. A fix involved making 
+	     a top-level parameter called $hiddenKey and then using that instead of matching
+	     "key". This has been removed.
+	-->
+	<!-- Mode do-keys: turns an xsl:key found in the schema into a key declaration
+	     in the generated stylesheet.
+	     * Inside a rule: match is the rule's context, name is @name, and use is
+	       @path when present, otherwise @use.
+	     * Anywhere else: the key is emitted with all of its attributes copied, so
+	       it must carry its own match attribute.
+	     Missing attributes are reported through xsl:message. -->
+	<xsl:template  match="xsl:key" mode="do-keys" >
+	     <xsl:if test="not(@name)">
+              <xsl:message>Markup Error: no name attribute in &lt;key></xsl:message>
+         </xsl:if>
+                <xsl:if test="not(@path) and not(@use)">
+                    <xsl:message>Markup Error: no path or use attribute in &lt;key></xsl:message>
+                </xsl:if>         
+	     <xsl:choose>
+	     	<xsl:when test="parent::iso:rule ">
+	        <xsl:call-template name="IamEmpty" />
+	       <xsl:choose>
+	       	<xsl:when test="@path">
+				<axsl:key match="{../@context}" name="{@name}" use="{@path}"/>
+			</xsl:when>
+			<xsl:otherwise>
+							<axsl:key match="{../@context}" name="{@name}" use="{@use}"/>
+			</xsl:otherwise>
+			</xsl:choose>	
+		</xsl:when>
+		<xsl:otherwise>
+                <!-- this check is about @match, so the message names that attribute -->
+                <xsl:if test="not(@match) ">
+                    <xsl:message>Markup Error: no match attribute in &lt;key></xsl:message>
+                </xsl:if>   		
+			<axsl:key>
+      			<xsl:copy-of select="@*"/>
+    		</axsl:key>	
+		</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+	<!-- Default mode: xsl:key produces nothing here -->
+	<xsl:template match="xsl:key "  /><!-- swallow -->
+
+	<!-- A <key> in the ISO Schematron namespace is only reported; nothing is generated for it -->
+	<xsl:template match="iso:key "  >
+		<xsl:message>Schema error: The key element is not in the ISO Schematron namespace. Use the XSLT namespace.</xsl:message>
+    </xsl:template>
+
+   <!-- ISO INCLUDE -->
+   <!-- This is only a fallback. Include really needs to have been done before this as a separate pass.-->
+
+   <!-- Include whose href is missing or blank. Priority 1 makes it win over the
+        general include template. Processing is terminated when $debug is 'false';
+        for any other value of $debug the element is silently skipped. -->
+   <xsl:template match="iso:include[not(normalize-space(@href))]"
+	   priority="1">
+	<xsl:if test=" $debug = 'false' ">
+		<xsl:message terminate="yes">Schema error: Empty href= attribute for include directive.</xsl:message>
+	</xsl:if>
+
+   </xsl:template>
+
+   <!-- Extend the URI syntax to allow # references -->
+   <!-- Add experimental support for simple containers like  /xxx:xxx/iso:pattern to allow better includes -->
+   <!-- Fallback include processing in the default mode.
+        href is split at the first "#" into a document URI and a fragment id.
+        * Both parts empty: an error message, nothing included.
+        * Fragment id present: every ISO Schematron element in the referenced
+          document whose id equals the fragment id is processed.
+        * No fragment: the document element if it is in the ISO namespace, plus
+          the ISO-namespace children of the document element, are processed.
+        Including a whole <schema> is reported, but processing still continues. -->
+   <xsl:template match="iso:include">
+       <!-- appending '#' guarantees substring-before finds a delimiter even without a fragment -->
+       <xsl:variable name="document-uri" select="substring-before(concat(@href,'#'), '#')"/>
+       <xsl:variable name="fragment-id" select="substring-after(@href, '#')"/>
+       
+       <xsl:choose> 
+          
+          <xsl:when test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0" >
+          	<xsl:message>Error: Impossible URL in Schematron include</xsl:message>
+          </xsl:when> 
+          
+          <xsl:when test="string-length( $fragment-id ) &gt; 0">
+              <!-- the second argument "/" makes a relative URI resolve against the schema document -->
+              <xsl:variable name="theDocument_1" select="document( $document-uri,/ )" />
+              <xsl:variable name="theFragment_1" select="$theDocument_1//iso:*[@id= $fragment-id ]" />
+              <xsl:if test=" $theFragment_1/self::iso:schema ">
+                 <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+              </xsl:if>
+              <xsl:apply-templates select=" $theFragment_1"/>
+		   </xsl:when>
+		  
+		   <xsl:otherwise>
+              <xsl:variable name="theDocument_2" select="document( $document-uri,/ )" />
+              <xsl:variable name="theFragment_2" select="$theDocument_2/iso:*" />
+              <xsl:variable name="theContainedFragments" select="$theDocument_2/*/iso:*" />
+              <xsl:if test=" $theFragment_2/self::iso:schema or $theContainedFragments/self::iso:schema">
+                 <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+              </xsl:if>
+       		<xsl:apply-templates select="$theFragment_2 | $theContainedFragments "/>
+       	   </xsl:otherwise>
+       </xsl:choose>
+   </xsl:template>
+
+   <!-- This is to handle the particular case of including patterns -->  
+   <!-- Same resolution of href as the default-mode include template (document URI
+        and optional "#fragment"), but the included nodes are processed in mode
+        do-all-patterns. -->
+   <xsl:template match="iso:include" mode="do-all-patterns">
+       <!-- appending '#' guarantees substring-before finds a delimiter even without a fragment -->
+       <xsl:variable name="document-uri" select="substring-before(concat(@href,'#'), '#')"/>
+       <xsl:variable name="fragment-id" select="substring-after(@href, '#')"/>
+ 
+       <xsl:choose> 
+          
+          <xsl:when test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0" >
+          	<xsl:message>Error: Impossible URL in Schematron include</xsl:message>
+          </xsl:when> 
+          
+          <xsl:when test="string-length( $fragment-id ) &gt; 0">
+              <xsl:variable name="theDocument_1" select="document( $document-uri,/ )" />
+              <xsl:variable name="theFragment_1" select="$theDocument_1//iso:*[@id= $fragment-id ]" />
+              <xsl:if test=" $theFragment_1/self::iso:schema ">
+                 <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+              </xsl:if>
+              <xsl:apply-templates select=" $theFragment_1" mode="do-all-patterns"/>
+		   </xsl:when>
+		  
+		   <xsl:otherwise>
+		   	  <!-- Import the top-level element if it is in schematron namespace,
+		   	  or its children otherwise, to allow a simple containment mechanism. -->
+              <xsl:variable name="theDocument_2" select="document( $document-uri,/ )" />
+              <xsl:variable name="theFragment_2" select="$theDocument_2/iso:*" />
+              <xsl:variable name="theContainedFragments" select="$theDocument_2/*/iso:*" />
+              <xsl:if test=" $theFragment_2/self::iso:schema or $theContainedFragments/self::iso:schema">
+                 <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+              </xsl:if>
+       		<xsl:apply-templates select="$theFragment_2 | $theContainedFragments "
+       		mode="do-all-patterns" />
+       	   </xsl:otherwise>
+       </xsl:choose>
+   </xsl:template>
+   
+	<!-- ISO LET -->
+	<!-- Compiles a <let>.
+	     * Directly under <schema>: a generated parameter named @name. The select
+	       attribute is added only when @value is non-empty, because an empty
+	       select is not a valid XPath expression; without it the parameter simply
+	       has no default expression.
+	     * Anywhere else: a generated variable named @name whose select is @value.
+	     A warning is issued when the schema declares the "xpath" query binding. -->
+	<xsl:template match="iso:let" >
+	  <xsl:if test="ancestor::iso:schema[@queryBinding='xpath']">
+                    <xsl:message>Warning: Variables should not be used with the "xpath" query language binding.</xsl:message>
+       </xsl:if>
+		
+       <!-- lets at the top-level are implemented as parameters -->
+ 
+       	<xsl:choose>
+       		<xsl:when test="parent::iso:schema">
+       			<!-- it is an error to have an empty param/@select because an XPath is expected,
+       			     so select is never written as a literal attribute here -->
+	      		 <axsl:param name="{@name}">
+	      		 		<xsl:if test="string-length(@value) &gt; 0">
+	      		 			<xsl:attribute name="select"><xsl:value-of select="@value"/></xsl:attribute>
+	      		 		</xsl:if>
+	      		 </axsl:param> 
+       		</xsl:when>
+       		<xsl:otherwise>
+				<axsl:variable name="{@name}" select="{@value}"/>
+			</xsl:otherwise>
+		</xsl:choose>
+		  
+	</xsl:template>	
+
+	<!-- ISO NAME -->
+	<!-- Text mode: asks process-name to output a node name. The expression passed
+	     is name(PATH) when a path attribute is given and name(.) otherwise.
+	     Child elements are reported through IamEmpty afterwards. -->
+	<xsl:template match="iso:name" mode="text">
+
+		<xsl:choose>
+			<xsl:when test="@path">
+				<xsl:call-template name="process-name">
+					<xsl:with-param name="name" select="concat('name(',@path,')')"/>
+				</xsl:call-template>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:call-template name="process-name">
+					<xsl:with-param name="name" select="'name(.)'"/>
+				</xsl:call-template>
+			</xsl:otherwise>
+		</xsl:choose>
+		<xsl:call-template name="IamEmpty" />
+	</xsl:template>
+
+	<!-- ISO NS -->
+	<!-- Namespace handling in XSLT is quite tricky and implementation dependent -->
+	<!-- Default mode: delegates to the handle-namespace template with this <ns> as the context node -->
+	<xsl:template match="iso:ns">
+ 		<xsl:call-template name="handle-namespace" />
+	</xsl:template>
+
+    <!-- This template is just to provide the API hook -->
+	<!-- Mode do-all-patterns: reports a missing uri or prefix attribute and any
+	     child elements, then passes prefix and uri to the process-ns template. -->
+	<xsl:template match="iso:ns"  mode="do-all-patterns" >
+               <xsl:if test="not(@uri)">
+                    <xsl:message>Markup Error: no uri attribute in &lt;ns></xsl:message>
+                </xsl:if>
+               <xsl:if test="not(@prefix)">
+                    <xsl:message>Markup Error: no prefix attribute in &lt;ns></xsl:message>
+                </xsl:if>
+	        <xsl:call-template name="IamEmpty" />
+		<xsl:call-template name="process-ns" >
+			<xsl:with-param name="prefix" select="@prefix"/>
+			<xsl:with-param name="uri" select="@uri"/>
+		</xsl:call-template>
+	</xsl:template>
+
+	<!-- ISO P -->
+	<!-- Mode do-schema-p: a <p> directly under <schema> is passed to process-p with its class, icon, id and xml:lang attributes -->
+	<xsl:template match="iso:schema/iso:p " mode="do-schema-p" >
+		<xsl:call-template name="process-p">
+			<xsl:with-param name="class" select="@class"/>
+			<xsl:with-param name="icon" select="@icon"/>
+			<xsl:with-param name="id" select="@id"/>
+			<xsl:with-param name="lang" select="@xml:lang"/>
+		</xsl:call-template>
+	</xsl:template>
+	<!-- Mode do-pattern-p: a <p> directly under <pattern> is passed to process-p with the same four attributes -->
+	<xsl:template match="iso:pattern/iso:p " mode="do-pattern-p" >
+		<xsl:call-template name="process-p">
+			<xsl:with-param name="class" select="@class"/>
+			<xsl:with-param name="icon" select="@icon"/>
+			<xsl:with-param name="id" select="@id"/>
+			<xsl:with-param name="lang" select="@xml:lang"/>
+		</xsl:call-template>
+	</xsl:template>
+	
+    <!-- Currently, iso:p in other position are not passed through to the API -->
+	<!-- Default mode: both templates are empty, so a <p> inside <phase>, and any other <p> (at priority -1), produces nothing -->
+	<xsl:template match="iso:phase/iso:p" />
+	<xsl:template match="iso:p " priority="-1" />
+
+	<!-- ISO PATTERN -->
+	<!-- Mode do-all-patterns: for a pattern that is active (phase is '#ALL', or the
+	     selected phase lists this pattern's id), emits the output of process-pattern
+	     followed by a generated apply-templates that starts the pattern's own mode.
+	     The mode is named "M" plus the number of preceding sibling elements.
+	     How the start nodes are chosen depends on $select-contexts:
+	       'key'  uses key('M', <mode name>);
+	       '//'   uses a union of all rule contexts, prefixed with //;
+	       other  starts from the document root. -->
+	<xsl:template match="iso:pattern" mode="do-all-patterns">
+	<xsl:if test="($phase = '#ALL') 
+	or (../iso:phase[@id= $phase]/iso:active[@pattern= current()/@id])">
+		<xsl:call-template name="process-pattern">
+			<!-- the following select statement assumes that
+			@id | sch:title returns node-set in document order:
+			we want the title if it is there, otherwise the @id attribute -->
+			<xsl:with-param name="name" select="(@id | iso:title )[last()]"/>
+			<xsl:with-param name="is-a" select="''"/>
+			
+					<!-- "Rich" properties -->
+					<xsl:with-param name="fpi" select="@fpi"/>
+					<xsl:with-param name="icon" select="@icon"/>
+					<xsl:with-param name="id" select="@id"/>
+					<xsl:with-param name="lang" select="@xml:lang"/>
+					<xsl:with-param name="see" select="@see" />
+					<xsl:with-param name="space" select="@xml:space" />
+		</xsl:call-template>
+		<xsl:choose>
+		  <xsl:when test="$select-contexts='key'">
+		    <axsl:apply-templates select="key('M','M{count(preceding-sibling::*)}')" mode="M{count(preceding-sibling::*)}"/>
+		  </xsl:when>
+		  <xsl:when test="$select-contexts='//'">
+		    <axsl:apply-templates mode="M{count(preceding-sibling::*)}">
+		      <!-- builds select="//((ctx1)|(ctx2)|...)" from the rule contexts of this pattern -->
+		      <xsl:attribute name="select">
+			<xsl:text>//(</xsl:text>
+			<xsl:for-each select="iso:rule/@context">
+			  <xsl:text>(</xsl:text>
+			  <xsl:value-of select="."/>
+			  <xsl:text>)</xsl:text>
+			  <xsl:if test="position()!=last()">|</xsl:if>
+			</xsl:for-each>
+			<xsl:text>)</xsl:text>
+			<xsl:if test="$visit-text='false'">[not(self::text())]</xsl:if>
+		      </xsl:attribute>
+		    </axsl:apply-templates>
+		  </xsl:when>
+		  <xsl:otherwise>
+		    <axsl:apply-templates select="/" mode="M{count(preceding-sibling::*)}"/>
+		  </xsl:otherwise>
+		</xsl:choose>
+        </xsl:if>
+	</xsl:template>
+	
+	<!-- Default mode: an abstract pattern is only reported; nothing is generated for it -->
+	<xsl:template match="iso:pattern[@abstract='true']">
+    
+             <xsl:message>Schema implementation error: This schema has abstract patterns, yet they are supposed to be preprocessed out already
+             </xsl:message>
+    </xsl:template>
+
+    <!-- Here is the template for the normal case of patterns -->
+	<!-- Default mode, non-abstract pattern. When the pattern is active (same test
+	     as in mode do-all-patterns) it writes a "PATTERN id title" comment and
+	     processes the pattern's children. If $select-contexts is not set, it also
+	     emits two low-priority templates in the pattern's mode: one that drops text
+	     nodes, and one that keeps the traversal going through $context-xpath. -->
+	<xsl:template match="iso:pattern[not(@abstract='true')]">
+     
+      <xsl:if test="($phase = '#ALL') 
+	          or (../iso:phase[@id= $phase]/iso:active[@pattern= current()/@id])">
+ 
+		<xsl:text>&#10;&#10;</xsl:text>
+		<xsl:comment>PATTERN <xsl:value-of select="@id" /> <xsl:value-of select="iso:title" /> </xsl:comment><xsl:text>&#10;</xsl:text>      
+		<xsl:apply-templates />
+		
+		<!-- DPC select-contexts test -->
+		<xsl:if test="not($select-contexts)">
+		  <axsl:template match="text()" priority="-1" mode="M{count(preceding-sibling::*)}">
+		    <!-- strip characters -->
+		  </axsl:template>
+		  
+		  <!-- DPC introduce context-xpath variable -->
+		  <axsl:template match="@*|node()"
+				 priority="-2"
+				 mode="M{ count(preceding-sibling::*) }">
+		    <axsl:apply-templates select="{$context-xpath}" mode="M{count(preceding-sibling::*)}"/>
+		  </axsl:template>
+		</xsl:if>
+      </xsl:if>
+	</xsl:template>
+
+	<!-- ISO PHASE -->
+	<!-- Reports a <phase> without an id attribute, then processes its children in the default mode -->
+	<xsl:template match="iso:phase" >
+                <xsl:if test="not(@id)">
+                    <xsl:message>Markup Error: no id attribute in &lt;phase></xsl:message>
+                </xsl:if>
+		  <xsl:apply-templates/>
+	</xsl:template>
+
+	<!-- ISO RULE -->
+	<!-- Compiles a non-abstract rule into a generated template that matches the
+	     rule's context, in the mode of the enclosing pattern ("M" plus the number
+	     of elements preceding the rule's parent).
+	     The generated template contains the output of process-rule followed by
+	     the compiled children of the rule. If $select-contexts is not set, it ends
+	     with an apply-templates over $context-xpath in the same mode.
+	     When $select-contexts is 'key', a key named "M" is also emitted for the context.
+	     A rule without a context attribute is reported through xsl:message. -->
+	<xsl:template match="iso:rule[not(@abstract='true')] ">
+                <xsl:if test="not(@context)">
+                    <xsl:message>Markup Error: no context attribute in &lt;rule></xsl:message>
+                </xsl:if>
+        <xsl:text>&#10;&#10;	</xsl:text>
+		<xsl:comment>RULE <xsl:value-of select="@id" /> </xsl:comment><xsl:text>&#10;</xsl:text>   
+        <xsl:if test="iso:title">
+		    <xsl:comment><xsl:value-of select="iso:title" /></xsl:comment>
+		  </xsl:if>
+		<!-- DPC select-contexts -->
+		<xsl:if test="$select-contexts='key'">
+		    <axsl:key name="M"
+			      match="{@context}" 
+			      use="'M{count(../preceding-sibling::*)}'"/>
+		</xsl:if>
+   
+	
+<!-- DPC priorities count up from 1000 not down from 4000 (templates in same priority order as before) -->
+		<!-- priority is 1000 + number of following siblings, so an earlier rule gets the higher priority -->
+		<axsl:template match="{@context}"
+		priority="{1000 + count(following-sibling::*)}" mode="M{count(../preceding-sibling::*)}">
+			<xsl:call-template name="process-rule">
+				<xsl:with-param name="context" select="@context"/>
+				
+					<!-- "Rich" properties -->
+					<xsl:with-param name="fpi" select="@fpi"/>
+					<xsl:with-param name="icon" select="@icon"/>
+					<xsl:with-param name="id" select="@id"/>
+					<xsl:with-param name="lang" select="@xml:lang"/>
+					<xsl:with-param name="see" select="@see" />
+					<xsl:with-param name="space" select="@xml:space" />
+					
+					<!-- "Linking" properties -->
+					<xsl:with-param name="role" select="@role" />
+					<xsl:with-param name="subject" select="@subject" />
+			</xsl:call-template>
+			<xsl:apply-templates/>
+			<!-- DPC introduce context-xpath and select-contexts variables -->
+			<xsl:if test="not($select-contexts)">
+			  <axsl:apply-templates select="{$context-xpath}" mode="M{count(../preceding-sibling::*)}"/>
+			</xsl:if>
+		</axsl:template>
+	</xsl:template>
+
+
+	<!-- ISO ABSTRACT RULE -->
+	<!-- Default mode: an abstract rule generates nothing here. It is only checked:
+	     it must have an id and must not have a context attribute. -->
+	<xsl:template match="iso:rule[@abstract='true'] " >
+		<xsl:if test=" not(@id)">
+                    <xsl:message>Markup Error: no id attribute on abstract &lt;rule></xsl:message>
+                </xsl:if>
+ 		<xsl:if test="@context">
+                    <xsl:message>Markup Error: (2) context attribute on abstract &lt;rule></xsl:message>
+                </xsl:if>
+	</xsl:template>
+
+	<!-- Mode extends: the children of the abstract rule are processed in the
+	     default mode at the current point. A context attribute is reported. -->
+	<xsl:template match="iso:rule[@abstract='true']"
+		mode="extends" >
+                <xsl:if test="@context">
+                    <xsl:message>Markup Error: context attribute on abstract &lt;rule></xsl:message>
+                </xsl:if>
+			<xsl:apply-templates/>
+	</xsl:template>
+
+	<!-- ISO SPAN -->
+	<!-- Text mode: forwards the class attribute of <span> to the process-span template -->
+	<xsl:template match="iso:span" mode="text">
+		<xsl:call-template name="process-span">
+			<xsl:with-param name="class" select="@class"/>
+		</xsl:call-template>
+	</xsl:template>
+
+	<!-- ISO TITLE -->
+	
+	<!-- A title directly under <schema> goes to process-schema-title; priority 1 makes this win over the general title template -->
+	<xsl:template match="iso:schema/iso:title"  priority="1">
+	     <xsl:call-template name="process-schema-title" />
+	</xsl:template>
+ 
+	
+	<!-- Any other title is delegated to the process-title template -->
+	<xsl:template match="iso:title" >
+	     <xsl:call-template name="process-title" />
+	</xsl:template>
+ 
+
+	<!-- ISO VALUE-OF -->
+	<!-- Text mode: passes the select attribute to process-value-of, or the
+	     expression "." when the attribute is absent. A missing select attribute and
+	     any child elements are reported through xsl:message first. -->
+	<xsl:template match="iso:value-of" mode="text" >
+		<xsl:if test="not(@select)">
+			<xsl:message>Markup Error: no select attribute in &lt;value-of></xsl:message>
+		</xsl:if>
+		<xsl:call-template name="IamEmpty" />
+
+		<!-- exactly one of the two complementary branches below runs -->
+		<xsl:if test="@select">
+			<xsl:call-template name="process-value-of">
+				<xsl:with-param name="select" select="@select"/>
+			</xsl:call-template>
+		</xsl:if>
+		<xsl:if test="not(@select)">
+			<xsl:call-template name="process-value-of">
+				<xsl:with-param name="select" select="'.'"/>
+			</xsl:call-template>
+		</xsl:if>
+
+	</xsl:template>
+
+
+<!-- ============================================================== -->
+<!-- DEFAULT TEXT HANDLING  -->
+<!-- ============================================================== -->
+	<!-- Text nodes of the schema are dropped in the modes do-keys, do-all-patterns,
+	     do-schema-p, do-pattern-p and in the default mode (all at priority -1);
+	     in the modes text and inline-text they are copied to the output. -->
+	<xsl:template match="text()" priority="-1" mode="do-keys">
+		<!-- strip characters -->
+	</xsl:template>
+	<xsl:template match="text()" priority="-1" mode="do-all-patterns">
+		<!-- strip characters -->
+	</xsl:template>
+        <xsl:template match="text()" priority="-1" mode="do-schema-p">
+		<!-- strip characters -->
+	</xsl:template>
+        <xsl:template match="text()" priority="-1" mode="do-pattern-p">
+		<!-- strip characters -->
+	</xsl:template>
+	
+	<xsl:template match="text()" priority="-1">
+		<!-- Strip characters -->
+	</xsl:template>
+	
+	<!-- copy the text through unchanged -->
+	<xsl:template match="text()" mode="text">
+		<xsl:value-of select="."/>
+	</xsl:template>
+
+	<!-- copy the text through unchanged -->
+	<xsl:template match="text()" mode="inline-text">
+		<xsl:value-of select="."/>
+	</xsl:template>
+
+<!-- ============================================================== -->
+<!-- UTILITY TEMPLATES -->
+<!-- ============================================================== -->
+<!-- Named helper: issues a warning when the context node has any child element.
+     It produces no output of its own. -->
+<xsl:template name="IamEmpty">
+	<!-- a non-empty node-set is true, which is the same condition as a non-zero count -->
+	<xsl:if test="*">
+		<xsl:message>Warning: <xsl:value-of select="name(.)"/> must not contain any child elements</xsl:message>
+	</xsl:if>
+</xsl:template>
+
+<!-- Named recursive helper: walks a space-separated list of diagnostic ids.
+     Parameter:
+       str  the list; the part before the first space is handled now, the rest
+            is handed to a recursive call.
+     For each non-blank id it writes one space and then processes every
+     <diagnostic> element in the document with that id, whether it is in the iso:
+     namespace, the sch: namespace or no namespace. An id with no matching
+     element is reported through xsl:message. -->
+<xsl:template name="diagnosticsSplit">
+  <!-- Process at the current point the first of the <diagnostic> elements
+       referred to parameter str, and then recurse -->
+  <xsl:param name="str"/>
+  <!-- first token: everything before the first space, or the whole string -->
+  <xsl:variable name="start">
+    <xsl:choose>
+      <xsl:when test="contains($str,' ')">
+	<xsl:value-of  select="substring-before($str,' ')"/>
+      </xsl:when>
+      <xsl:otherwise><xsl:value-of select="$str"/></xsl:otherwise>
+    </xsl:choose>
+  </xsl:variable>
+
+  <!-- remainder after the first space; empty when there is no space -->
+  <xsl:variable name="end">
+    <xsl:if test="contains($str,' ')">
+      <xsl:value-of select="substring-after($str,' ')"/>
+    </xsl:if>
+  </xsl:variable>
+
+  <!-- This works with all namespaces -->
+  <xsl:if test="not(string-length(normalize-space($start)) = 0)
+  		and not(//iso:diagnostic[@id = $start])
+		and not(//sch:diagnostic[@id = $start]) 
+		and not(//diagnostic[@id = $start])">
+	<xsl:message>Reference error: A diagnostic "<xsl:value-of select="string($start)"
+	/>" has been referenced but is not declared</xsl:message>
+  </xsl:if>
+
+  <!-- consecutive spaces yield an empty token, which is skipped here -->
+  <xsl:if test="string-length(normalize-space($start)) > 0">
+     <xsl:text> </xsl:text>
+     <xsl:apply-templates 
+        select="//iso:diagnostic[@id = $start ]
+        	| //sch:diagnostic[@id = $start ] 
+            | //diagnostic[@id= $start ]"/>
+  </xsl:if>
+
+  <!-- the recursion ends when nothing is left after the current token -->
+  <xsl:if test="not($end='')">
+    <xsl:call-template name="diagnosticsSplit">
+      <xsl:with-param name="str" select="$end"/>
+    </xsl:call-template>
+  </xsl:if>
+</xsl:template>
+
+<!-- It would be nice to use this but xsl:namespace does not
+  allow a fallback -->
+<!--xsl:template name="handle-namespace" version="2.0">
+   <xsl:namespace name="{@prefix}" select="@uri">
+</xsl:template-->
+
+<xsl:template name="handle-namespace">
+       <!-- experimental code from http://eccnet.eccnet.com/pipermail/schematron-love-in/2006-June/000104.html -->
+       <!-- Makes the output carry a binding of the context node's @prefix to its @uri. Namespaces are handled differently for exslt systems, msxml, and default, only using XSLT1 syntax -->
+       <!-- For more info see  http://fgeorges.blogspot.com/2007/01/creating-namespace-nodes-in-xslt-10.html -->
+       <xsl:choose>
+          <!-- The following code works for XSLT1: build a throw-away element in the wanted namespace, then copy just its namespace node for @prefix -->
+        <xsl:when test="function-available('exsl:node-set')">
+           <xsl:variable name="ns-dummy-elements">
+             <xsl:element name="{@prefix}:dummy" namespace="{@uri}"/>
+           </xsl:variable>
+       	   <xsl:variable name="p" select="@prefix"/>
+           <xsl:copy-of select="exsl:node-set($ns-dummy-elements)
+                                  /*/namespace::*[local-name()=$p]"/>
+         </xsl:when>        
+
+   			<!-- End XSLT1  code -->
+  
+        <!-- Not tested yet       
+    	<xsl:when test="function-available('msxsl:node-set')">
+      		<xsl:variable name="ns-dummy-elements">
+        		<xsl:element name="{ $prefix }:e" namespace="{ $uri }"/>
+      		</xsl:variable>
+      		<xsl:copy-of select="msxsl:node-set($ns-dummy-elements)/*/namespace::*"/>
+    	</xsl:when>
+        -->
+        
+        <xsl:when test="@prefix = 'xsl' ">
+           <!-- Do not generate dummy attributes with the xsl: prefix, as these
+                are errors against XSLT, because we presume that the output
+                stylesheet uses the xsl prefix. In any case, there would already
+                be a namespace declaration for the XSLT namespace generated
+                automatically, presumably using "xsl:".
+           -->
+        </xsl:when>
+        
+        <xsl:when test="@uri = 'http://www.w3.org/1999/XSL/Transform'">
+          <xsl:message terminate="yes">
+            <xsl:text>Using the XSLT namespace with a prefix other than "xsl" in </xsl:text>
+            <xsl:text>Schematron rules is not supported </xsl:text>
+            <xsl:text>in this processor: </xsl:text>
+            <xsl:value-of select="system-property('xsl:vendor')"/>
+          </xsl:message>
+        </xsl:when>
+
+        <xsl:otherwise>
+          <xsl:attribute name="{concat(@prefix,':dummy-for-xmlns')}" namespace="{@uri}" />
+          <!-- Fallback: the dummy attribute above is created in the @uri namespace under the @prefix prefix -->
+        </xsl:otherwise>
+      </xsl:choose>
+
+
+</xsl:template>
+
+<!-- ============================================================== -->
+<!-- UNEXPECTED ELEMENTS -->
+<!-- ============================================================== -->
+
+	<xsl:template match="iso:*"  priority="-2">
+	   <xsl:message>
+			<!-- Catch-all for misspelt ISO Schematron elements; the trailing ": " keeps the element name from being glued onto the message -->
+			<xsl:text>Error: unrecognized element in ISO Schematron namespace: check spelling and capitalization: </xsl:text>
+			<xsl:value-of select="name(.)"/>
+		</xsl:message>
+	</xsl:template>
+	
+	
+	<!-- Swallow old namespace (sch:) elements without producing output: there is an upfront test for them elsewhere -->
+	<xsl:template match="sch:*"  priority="-2" />
+	
+	<xsl:template match="*"  priority="-3">
+		<!-- Lowest-priority fallback for any other element: copied through unless $allow-foreign is 'false' -->
+		<xsl:choose>
+			<xsl:when test="not($allow-foreign = 'false')">
+				<xsl:copy-of select="." />
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:message>
+					<xsl:text>Warning: unrecognized element </xsl:text><xsl:value-of select="name(.)"/>
+				</xsl:message>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+	
+	<xsl:template match="iso:*" mode="text" priority="-2" /> <!-- In text mode, otherwise unmatched iso: elements produce nothing -->
+	<xsl:template match="*" mode="text" priority="-3">
+		<!-- Text-mode fallback for any other element: copied through unless $allow-foreign is 'false' -->
+		<xsl:choose>
+			<xsl:when test="not($allow-foreign = 'false')">
+				<xsl:copy-of select="." />
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:message>
+					<xsl:text>Warning: unrecognized element </xsl:text><xsl:value-of select="name(.)"/>
+				</xsl:message>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+<!-- ============================================================== -->
+<!-- DEFAULT NAMED TEMPLATES -->
+<!-- These are the actions that are performed unless overridden -->
+<!-- ============================================================== -->
+ 
+	<xsl:template name="process-prolog"/>
+	<!-- Default process-prolog: takes no params and produces no output -->
+
+	<xsl:template name="process-root">
+		<xsl:param name="contents"/>
+		<xsl:param name="id" />
+		<xsl:param name="version" />
+		<xsl:param name="schemaVersion" />
+		<xsl:param name="queryBinding" />
+		<xsl:param name="title" />
+		<!-- Default root handler: every parameter except $contents is accepted but not used here -->
+
+		<!-- "Rich" parameters -->
+		<xsl:param name="fpi" />
+		<xsl:param name="icon" />
+		<xsl:param name="lang" />
+		<xsl:param name="see" />
+		<xsl:param name="space" />
+		<!-- Pass the supplied contents through unchanged -->
+		<xsl:copy-of select="$contents"/>
+	</xsl:template>
+
+	<xsl:template name="process-assert">
+		<!-- Default handler for an assertion: delegates to process-message; only $test and $role are used -->
+		<xsl:param name="test"/>
+		<xsl:param name="diagnostics" />
+		<xsl:param name="id" />
+		<xsl:param name="flag" />
+
+           	<!-- "Linkable" parameters -->
+		<xsl:param name="role"/>
+		<xsl:param name="subject"/>
+
+		<!-- "Rich" parameters -->
+		<xsl:param name="fpi" />
+		<xsl:param name="icon" />
+		<xsl:param name="lang" />
+		<xsl:param name="see" />
+		<xsl:param name="space" />
+
+		<!-- The test expression is handed over under the name "pattern" -->
+		<xsl:call-template name="process-message">
+			<xsl:with-param name="pattern" select="$test"/>
+			<xsl:with-param name="role" select="$role"/>
+		</xsl:call-template>
+		
+		
+	</xsl:template>
+
+	<xsl:template name="process-report">
+		<xsl:param name="test"/>
+		<xsl:param name="diagnostics" />
+		<xsl:param name="id" />
+		<xsl:param name="flag" />
+
+           	<!-- "Linkable" parameters -->
+		<xsl:param name="role"/>
+		<xsl:param name="subject"/>
+
+		<!-- "Rich" parameters -->
+		<xsl:param name="fpi" />
+		<xsl:param name="icon" /> 
+		<xsl:param name="lang" />
+		<xsl:param name="see" />
+		<xsl:param name="space" />
+		<!-- Default handler for a report: delegates to process-message; only $test (as "pattern") and $role are used -->
+		<xsl:call-template name="process-message">
+			<xsl:with-param name="pattern" select="$test"/>
+			<xsl:with-param name="role" select="$role"/>
+		</xsl:call-template>
+	</xsl:template>
+
+	<xsl:template name="process-diagnostic">
+		<xsl:param name="id" />
+
+		<!-- "Rich" parameters -->
+		<xsl:param name="fpi" />
+		<xsl:param name="icon" />
+		<xsl:param name="lang" />
+		<xsl:param name="see" />
+		<xsl:param name="space" />
+		<!-- Default handler for a diagnostic: none of the parameters is used; the children are processed in "text" mode -->
+	    <!-- We generate too much whitespace rather than risking concatenation -->
+		<axsl:text> </axsl:text>
+		<xsl:apply-templates mode="text"/>
+		<axsl:text> </axsl:text>
+	</xsl:template>
+
+	<xsl:template name="process-dir">
+      	<xsl:param name="value" />
+		<!-- Default handler for dir: $value is not used; the children are processed in "inline-text" mode -->
+	    <!-- We generate too much whitespace rather than risking concatenation -->
+		<axsl:text> </axsl:text>
+		<xsl:apply-templates mode="inline-text"/>
+		<axsl:text> </axsl:text>
+	</xsl:template>
+
+	<xsl:template name="process-emph"> 
+	    <!-- Default handler for emph: children are processed in "inline-text" mode. We generate too much whitespace rather than risking concatenation -->
+		<axsl:text> </axsl:text>
+		<xsl:apply-templates mode="inline-text"/>
+		<axsl:text> </axsl:text>
+	</xsl:template>
+	
+	<xsl:template name="process-name">
+		<xsl:param name="name"/>
+		<!-- Default handler for name: $name becomes the select expression of an axsl:value-of written to the output -->
+		<!-- We generate too much whitespace rather than risking concatenation -->
+		<axsl:text> </axsl:text>
+		<axsl:value-of select="{$name}"/>
+		<axsl:text> </axsl:text>
+		
+    </xsl:template>
+
+	<xsl:template name="process-ns" >
+	<!-- Note that process-ns is for reporting; this default produces no output. The sch:ns elements are 
+	     independently used in the sch:schema template to provide namespace bindings -->
+		<xsl:param name="prefix"/>
+		<xsl:param name="uri" />
+      </xsl:template>
+
+	<xsl:template name="process-p"> <!-- Default handler for p: accepts the parameters below and produces no output -->
+		<xsl:param name="id" />
+		<xsl:param name="class" />
+		<xsl:param name="icon" />
+		<xsl:param name="lang" />
+      </xsl:template>
+
+	<xsl:template name="process-pattern">
+		<xsl:param name="id" />
+		<xsl:param name="name" />
+		<xsl:param name="is-a" />
+		<!-- Default handler for a pattern: accepts the parameters and produces no output -->
+		<!-- "Rich" parameters -->
+		<xsl:param name="fpi" />
+		<xsl:param name="icon" />
+		<xsl:param name="lang" />
+		<xsl:param name="see" />
+		<xsl:param name="space" />
+      </xsl:template>
+      
+
+	<xsl:template name="process-rule">
+		<xsl:param name="context" />
+		<!-- Default handler for a rule: accepts the parameters and produces no output -->
+		<xsl:param name="id" />
+		<xsl:param name="flag" />
+
+           	<!-- "Linkable" parameters -->
+		<xsl:param name="role"/>
+		<xsl:param name="subject"/>
+  
+		<!-- "Rich" parameters -->
+		<xsl:param name="fpi" />
+		<xsl:param name="icon" />
+		<xsl:param name="lang" />
+		<xsl:param name="see" />
+		<xsl:param name="space" />
+      </xsl:template>
+
+	<xsl:template name="process-span" >
+		<xsl:param name="class" />
+		<!-- Default handler for span: $class is not used; the children are processed in "inline-text" mode -->
+	    <!-- We generate too much whitespace rather than risking concatenation -->
+		<axsl:text> </axsl:text>
+		<xsl:apply-templates mode="inline-text"/>
+		<axsl:text> </axsl:text>		
+	</xsl:template>
+
+	<xsl:template name="process-title" > <!-- Default handler for a title: treats it as a paragraph by calling process-p -->
+		<xsl:param name="class" /> <!-- NOTE(review): $class is not forwarded; the literal 'title' is always passed below - confirm this is intended -->
+	   <xsl:call-template name="process-p">
+	      <xsl:with-param  name="class">title</xsl:with-param>
+	   </xsl:call-template>
+	</xsl:template>
+		
+	<xsl:template name="process-schema-title" > <!-- Default handler for the schema title: calls process-title with class 'schema-title' -->
+		<xsl:param name="class" /> <!-- Accepted but not used: the literal 'schema-title' is passed below -->
+	   <xsl:call-template name="process-title">
+	      <xsl:with-param  name="class">schema-title</xsl:with-param>
+	   </xsl:call-template>
+	</xsl:template>
+
+	<xsl:template name="process-value-of">
+		<xsl:param name="select"/>
+		<!-- Default handler for value-of: $select becomes the select expression of an axsl:value-of written to the output -->
+	    <!-- We generate too much whitespace rather than risking concatenation -->
+		<axsl:text> </axsl:text>
+		<axsl:value-of select="{$select}"/>
+		<axsl:text> </axsl:text>
+	</xsl:template>
+
+	<!-- default output action: the simplest customization is to just override this -->
+	<xsl:template name="process-message">
+		<xsl:param name="pattern" />
+            <xsl:param name="role" />
+		<!-- Neither parameter is used here: the children are processed in "text" mode -->
+		<xsl:apply-templates mode="text"/>	
+		 <xsl:if test=" $message-newline = 'true'" >
+			<axsl:value-of  select="string('&#10;')"/>
+		</xsl:if>
+		<!-- The value-of above (a line feed) is only written when $message-newline is 'true' -->
+	</xsl:template>
+</xsl:stylesheet>
+
+
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl
new file mode 100644
index 0000000000000000000000000000000000000000..dae74ff6a2bd56a4f21147905c3a1661baf0b3ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl
@@ -0,0 +1,588 @@
+<?xml version="1.0" ?>
+<!-- 
+   ISO_SVRL.xsl   
+
+   Implementation of Schematron Validation Report Language from ISO Schematron
+   ISO/IEC 19757 Document Schema Definition Languages (DSDL) 
+     Part 3: Rule-based validation  Schematron 
+     Annex D: Schematron Validation Report Language 
+
+  This ISO Standard is available free as a Publicly Available Specification in PDF from ISO.
+  Also see www.schematron.com for drafts and other information.
+
+  This implementation of SVRL is designed to run with the "Skeleton" implementation 
+  of Schematron which Oliver Becker devised. The skeleton code provides a 
+  Schematron implementation but with named templates for handling all output; 
+  the skeleton provides basic templates for output using this API, but client
+  validators can be written to import the skeleton and override the default output
+  templates as required. (In order to understand this, you must understand that
+  a named template such as "process-assert" in this XSLT stylesheet overrides and
+  replaces any template with the same name in the imported skeleton XSLT file.)
+
+  The other important thing to understand in this code is that there are different
+  versions of the Schematron skeleton. These track the development of Schematron through
+  Schematron 1.5, Schematron 1.6 and now ISO Schematron. Only one skeleton must be
+  imported. The code has templates for the different skeletons commented out for 
+  convenience. ISO Schematron has a different namespace than Schematron 1.5 and 1.6;
+  so the ISO Schematron skeleton has been written itself with an optional import
+  statement to in turn import the Schematron 1.6 skeleton. This will allow you to 
+  validate with schemas from either namespace.
+  
+
+  History:  
+    2009-03-18
+    	* Fix attribute with space "see " which generates wrong name in some processors
+    2008-08-11
+   		* RJ Fix attribute/@select which saxon allows  in XSLT 1
+   2008-08-07
+    	* RJ Add output-encoding attribute to specify final encoding to use
+    	* Alter allow-foreign functionality so that Schematron span, emph and dir elements make 
+    	  it to the output, for better formatting and because span can be used to mark up
+    	  semantically interesting information embedded in diagnostics, which reduces the
+    	  need to extend SVRL itself
+    	* Diagnostic-reference had an invalid attribute @id that duplicated @diagnostic: removed
+  	2008-08-06
+    	* RJ Fix invalid output:  svrl:diagnostic-reference is not contained in an svrl:text
+    	* Output comment to SVRL file giving filename if available (from command-line parameter)
+  	2008-08-04
+  		* RJ move sch: prefix to schold: prefix to prevent confusion (we want people to
+  		be able to switch from old namespace to new namespace without changing the
+  		sch: prefix, so it is better to keep that prefix completely out of the XSLT)
+  		* Extra signature fixes (PH)
+    2008-08-03
+    	* Repair missing class parameter on process-p
+    2008-07-31
+    	* Update skeleton names
+    2007-04-03 
+    	* Add option generate-fired-rule (RG)
+    2007-02-07
+    	* Prefer true|false for parameters. But allow yes|no on some old for compatibility
+    	* DP Diagnostics output to svrl:text. Diagnosis put out after assertion text.
+      	* Removed non-SVRL elements and attributes: better handled as an extra layer that invokes this one
+      	* Add more formal parameters
+      	* Correct confusion between $schemaVersion and $queryBinding
+     	* Indent
+     	* Validate against RNC schemas for XSLT 1 and 2 (with regex tests removed)
+     	* Validate output with UniversalTest.sch against RNC schema for ISO SVRL
+    	
+    2007-02-01
+       	* DP. Update formal parameters of overriding named templates to handle more attributes.
+       	* DP. Refactor handling of rich and linkable parameters to a named template.
+
+    2007-01-22
+    	* DP change svrl:ns to svrl:ns-in-attribute-value
+		* Change default when no queryBinding from "unknown" to "xslt"
+	
+    2007-01-18:
+     	* Improve documentation
+     	* KH Add command-line options to generate paths or not 
+       	* Use axsl:attribute rather than xsl:attribute to shut XSLT2 up
+       	* Add extra command-line options to pass to the iso_schematron_skeleton
+  
+    2006-12-01: iso_svrl.xsl Rick Jelliffe, 
+          * update namespace, 
+          * update phase handling,
+          * add flag param to process-assert and process-report & @ flag on output
+  
+    2001: Conformance1-5.xsl Rick Jelliffe, 
+          * Created, using the skeleton code contributed by Oliver Becker
+-->
+<!--
+ Derived from Conformance1-5.xsl.
+
+ Copyright (c) 2001, 2006 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
+
+ This software is provided 'as-is', without any express or implied warranty. 
+ In no event will the authors be held liable for any damages arising from 
+ the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose, 
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product, 
+ an acknowledgment in the product documentation would be appreciated but is 
+ not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be 
+ misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source distribution.
+-->
+
+<!-- Ideas nabbed from schematrons by Francis N., Miloslav N. and David C. -->
+
+<!-- The command-line parameters are:
+  			phase           NMTOKEN | "#ALL" (default) Select the phase for validation
+    		allow-foreign   "true" | "false" (default)   Pass non-Schematron elements  and rich markup  to the generated stylesheet
+            diagnose= true | false|yes|no    Add the diagnostics to the assertion test in reports (yes|no are obsolete)
+            generate-paths=true|false|yes|no   generate the @location attribute with XPaths (yes|no are obsolete)
+            sch.exslt.imports semi-colon delimited string of filenames for some EXSLT implementations          
+   		 optimize        "visit-no-attributes"     Use only when the schema has no attributes as the context nodes
+		 generate-fired-rule "true"(default) | "false"  Generate fired-rule elements
+            
+-->
+
+<xsl:stylesheet
+   version="1.0"
+   xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+	xmlns:xs="http://www.w3.org/2001/XMLSchema"
+   xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias"
+   xmlns:schold="http://www.ascc.net/xml/schematron" 
+   xmlns:iso="http://purl.oclc.org/dsdl/schematron"
+   xmlns:svrl="http://purl.oclc.org/dsdl/svrl" 
+>
+
+<!-- Select the import statement and adjust the path as 
+   necessary for your system.
+   If not XSLT2 then also remove svrl:active-pattern/@document="{document-uri()}" from process-pattern()
+-->
+<!--
+<xsl:import href="iso_schematron_skeleton_for_saxon.xsl"/>
+--> 
+  
+<xsl:import href="iso_schematron_skeleton_for_xslt1.xsl"/>
+ <!--
+<xsl:import href="iso_schematron_skeleton.xsl"/>
+<xsl:import href="skeleton1-5.xsl"/>
+<xsl:import href="skeleton1-6.xsl"/>
+-->
+
+<xsl:param name="diagnose" >true</xsl:param>
+<xsl:param name="phase" >
+	<xsl:choose>
+		<!-- Handle Schematron 1.5 and 1.6 phases -->
+		<xsl:when test="//schold:schema/@defaultPhase">
+			<xsl:value-of select="//schold:schema/@defaultPhase"/>
+		</xsl:when>
+		<!-- Handle ISO Schematron phases -->
+		<xsl:when test="//iso:schema/@defaultPhase">
+			<xsl:value-of select="//iso:schema/@defaultPhase"/>
+		</xsl:when>
+		<xsl:otherwise>#ALL</xsl:otherwise>
+	</xsl:choose>
+</xsl:param>
+<xsl:param name="allow-foreign" >false</xsl:param>
+<xsl:param name="generate-paths" >true</xsl:param>
+<xsl:param name="generate-fired-rule" >true</xsl:param>
+<xsl:param name="optimize"/>
+<!-- When non-empty, process-prolog writes this value as the encoding attribute of the generated axsl:output -->
+<xsl:param name="output-encoding" ></xsl:param>
+
+<!-- e.g. saxon file.xml file.xsl "sch.exslt.imports=.../string.xsl;.../math.xsl" -->
+<xsl:param name="sch.exslt.imports" />
+
+<!-- $phase (above) defaults to the schema's @defaultPhase: old namespace first, then ISO, otherwise #ALL -->
+
+<!-- Experimental: if this file is being used, then SVRL must be what is being generated -->
+<xsl:variable name="svrlTest" select="true()" />
+
+  
+ 
+<!-- ================================================================ -->
+
+<xsl:template name="process-prolog">
+	<axsl:output method="xml" omit-xml-declaration="no" standalone="yes"
+		indent="yes">
+		<xsl:if test=" string-length($output-encoding) &gt; 0">
+			<xsl:attribute name="encoding"><xsl:value-of select=" $output-encoding" /></xsl:attribute>
+		</xsl:if>
+    </axsl:output>
+    <!-- Writes the axsl:output declaration; its encoding attribute is added only when $output-encoding is non-empty -->
+</xsl:template>
+
+<!-- Overrides skeleton.xsl -->
+<xsl:template name="process-root">
+	<!-- Overrides skeleton: wraps everything in the svrl:schematron-output root element.
+	     $queryBinding, $id and $version are accepted for API compatibility but not used here. -->
+	<xsl:param name="title"/>
+	<xsl:param name="contents" />
+	<xsl:param name="queryBinding" >xslt1</xsl:param>
+	<xsl:param name="schemaVersion" />
+	<xsl:param name="id" />
+	<xsl:param name="version"/>
+	<!-- "Rich" parameters -->
+	<xsl:param name="fpi" />
+	<xsl:param name="icon" />
+	<xsl:param name="lang" />
+	<xsl:param name="see" />
+	<xsl:param name="space" />
+
+	<svrl:schematron-output>
+		<!-- Written with axsl:attribute, not title="{$title}": a '{' or '}' in the value would otherwise be read as an AVT when the generated stylesheet runs -->
+		<axsl:attribute name="title"><xsl:value-of select="$title"/></axsl:attribute>
+		<axsl:attribute name="schemaVersion"><xsl:value-of select="$schemaVersion"/></axsl:attribute>
+		<xsl:if test=" string-length( normalize-space( $phase )) &gt; 0 and 
+		not( normalize-space( $phase ) = '#ALL') ">
+			<axsl:attribute name="phase"><xsl:value-of select=" $phase " /></axsl:attribute>
+		</xsl:if>
+		<!-- On the root element the rich attributes are only considered when foreign markup is allowed -->
+		<xsl:if test=" $allow-foreign = 'true'">
+			<xsl:call-template name='richParms'>
+				<xsl:with-param name="fpi" select="$fpi" />
+				<xsl:with-param name="icon" select="$icon"/>
+				<xsl:with-param name="lang" select="$lang"/>
+				<xsl:with-param name="see"  select="$see" />
+				<xsl:with-param name="space"  select="$space" />
+			</xsl:call-template>
+		</xsl:if>
+		<!-- Comment written by the generated stylesheet from its archive/file name parameters -->
+		 <axsl:comment><axsl:value-of select="$archiveDirParameter"/>  &#xA0;
+		 <axsl:value-of select="$archiveNameParameter"/> &#xA0;
+		 <axsl:value-of select="$fileNameParameter"/> &#xA0;
+		 <axsl:value-of select="$fileDirParameter"/></axsl:comment> 
+
+		<xsl:apply-templates mode="do-schema-p" />
+		<xsl:copy-of select="$contents" />
+	</svrl:schematron-output>
+</xsl:template>
+
+
+<xsl:template name="process-assert">
+	<!-- Overrides skeleton: writes an svrl:failed-assert element, with the
+	     assertion's attributes, its text and (when $diagnose is on) its
+	     diagnostics, into the generated stylesheet. $subject is passed on to
+	     linkableParms; $test is the assertion's test expression. -->
+	<xsl:param name="test"/>
+	<xsl:param name="diagnostics" />
+	<xsl:param name="id" />
+	<xsl:param name="flag" />
+	<!-- "Linkable" parameters -->
+	<xsl:param name="role"/>
+	<xsl:param name="subject"/>
+	<!-- "Rich" parameters -->
+	<xsl:param name="fpi" />
+	<xsl:param name="icon" />
+	<xsl:param name="lang" />
+	<xsl:param name="see" />
+	<xsl:param name="space" />
+	<svrl:failed-assert>
+		<!-- Written with axsl:attribute, not test="{$test}": a '{' or '}' in the expression would otherwise be read as an AVT when the generated stylesheet runs -->
+		<axsl:attribute name="test"><xsl:value-of select="$test"/></axsl:attribute>
+		<xsl:if test="string-length( $id ) &gt; 0">
+			<axsl:attribute name="id"><xsl:value-of select=" $id " /></axsl:attribute>
+		</xsl:if>
+		<xsl:if test=" string-length( $flag ) &gt; 0">
+			<axsl:attribute name="flag"><xsl:value-of select=" $flag " /></axsl:attribute>
+		</xsl:if>
+		<!-- Process rich attributes.  -->
+		<xsl:call-template name="richParms">
+			<xsl:with-param name="fpi" select="$fpi"/>
+			<xsl:with-param name="icon" select="$icon"/>
+			<xsl:with-param name="lang" select="$lang"/>
+			<xsl:with-param name="see" select="$see" />
+			<xsl:with-param name="space" select="$space" />
+		</xsl:call-template>
+		<xsl:call-template name='linkableParms'>
+			<xsl:with-param name="role" select="$role" />
+			<xsl:with-param name="subject" select="$subject"/>
+		</xsl:call-template>
+		<xsl:if test=" $generate-paths = 'true' or $generate-paths= 'yes' ">
+			<!-- true/false is the new way -->
+			<axsl:attribute name="location">
+				<axsl:apply-templates select="." mode="schematron-get-full-path"/>
+			</axsl:attribute>
+		</xsl:if>
+		<svrl:text>
+			<xsl:apply-templates mode="text" />
+		</svrl:text>
+		    <xsl:if test="$diagnose = 'yes' or $diagnose= 'true' ">
+			<!-- true/false is the new way -->
+				<xsl:call-template name="diagnosticsSplit">
+					<xsl:with-param name="str" select="$diagnostics"/>
+				</xsl:call-template>
+			</xsl:if>
+	</svrl:failed-assert>
+</xsl:template>
+
+<xsl:template name="process-report">
+	<!-- Overrides skeleton: writes an svrl:successful-report element, with the
+	     report's attributes, its text and (when $diagnose is on) its
+	     diagnostics, into the generated stylesheet. $subject is passed on to
+	     linkableParms; $test is the report's test expression. -->
+	<xsl:param name="id"/>
+	<xsl:param name="test"/>
+	<xsl:param name="diagnostics"/>
+	<xsl:param name="flag" />
+	<!-- "Linkable" parameters -->
+	<xsl:param name="role"/>
+	<xsl:param name="subject"/>
+	<!-- "Rich" parameters -->
+	<xsl:param name="fpi" />
+	<xsl:param name="icon" />
+	<xsl:param name="lang" />
+	<xsl:param name="see" />
+	<xsl:param name="space" />
+	<svrl:successful-report>
+		<!-- Written with axsl:attribute, not test="{$test}": a '{' or '}' in the expression would otherwise be read as an AVT when the generated stylesheet runs -->
+		<axsl:attribute name="test"><xsl:value-of select="$test"/></axsl:attribute>
+		<xsl:if test=" string-length( $id ) &gt; 0">
+			<axsl:attribute name="id"><xsl:value-of select=" $id " /></axsl:attribute>
+		</xsl:if>
+		<xsl:if test=" string-length( $flag ) &gt; 0">
+			<axsl:attribute name="flag"><xsl:value-of select=" $flag " /></axsl:attribute>
+		</xsl:if>
+		<!-- Process rich attributes.  -->
+		<xsl:call-template name="richParms">
+			<xsl:with-param name="fpi" select="$fpi"/>
+			<xsl:with-param name="icon" select="$icon"/>
+			<xsl:with-param name="lang" select="$lang"/>
+			<xsl:with-param name="see" select="$see" />
+			<xsl:with-param name="space" select="$space" />
+		</xsl:call-template>
+		<xsl:call-template name='linkableParms'>
+			<xsl:with-param name="role" select="$role" />
+			<xsl:with-param name="subject" select="$subject"/>
+		</xsl:call-template>
+		<xsl:if test=" $generate-paths = 'yes' or $generate-paths = 'true' ">
+			<!-- true/false is the new way -->
+			<axsl:attribute name="location">
+				<axsl:apply-templates select="." mode="schematron-get-full-path"/>
+			</axsl:attribute>
+		</xsl:if>
+		<!-- The report's own content, processed in "text" mode -->
+		<svrl:text>
+			<xsl:apply-templates mode="text" />
+		</svrl:text>
+			<xsl:if test="$diagnose = 'yes' or $diagnose='true' ">
+			<!-- true/false is the new way -->
+				<xsl:call-template name="diagnosticsSplit">
+					<xsl:with-param name="str" select="$diagnostics"/>
+				</xsl:call-template>
+			</xsl:if>
+	</svrl:successful-report>
+</xsl:template>
+
+
+    <!-- Overrides skeleton: how a Schematron dir element is rendered -->
+	<xsl:template name="process-dir" >
+		<xsl:param name="value" />
+		<xsl:choose>
+			<xsl:when test="not($allow-foreign = 'true')">
+				<!-- We generate too much whitespace rather than risking concatenation -->
+				<axsl:text> </axsl:text>
+				<xsl:apply-templates mode="inline-text"/>
+				<axsl:text> </axsl:text>
+			</xsl:when>
+			<xsl:otherwise>
+				<!-- Foreign markup is allowed: hand the element over unchanged -->
+				<xsl:copy-of select="."/>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+<xsl:template name="process-diagnostic">
+	<xsl:param name="id"/>
+	<!-- Rich parameters -->
+	<xsl:param name="fpi" />
+	<xsl:param name="icon" />
+	<xsl:param name="lang" />
+	<xsl:param name="see" />
+	<xsl:param name="space" />
+	<svrl:diagnostic-reference diagnostic="{$id}" >
+	  <!-- Overrides skeleton: one svrl:diagnostic-reference per diagnostic, identified by $id -->
+		<xsl:call-template name="richParms">
+			<xsl:with-param name="fpi" select="$fpi"/>
+			<xsl:with-param name="icon" select="$icon"/>
+			<xsl:with-param name="lang" select="$lang"/>
+			<xsl:with-param name="see" select="$see" />
+			<xsl:with-param name="space" select="$space" />
+		</xsl:call-template> 
+<xsl:text>
+</xsl:text>
+ <!-- The xsl:text above contributes a single line break before the diagnostic's content -->
+		<xsl:apply-templates mode="text"/>
+		 
+	</svrl:diagnostic-reference>
+</xsl:template>
+
+
+    <!-- Overrides skeleton: how a Schematron emph element is rendered -->
+	<xsl:template name="process-emph" >
+		<xsl:param name="class" />
+		<xsl:choose>
+			<xsl:when test="not($allow-foreign = 'true')">
+				<!-- We generate too much whitespace rather than risking concatenation -->
+				<axsl:text> </axsl:text>
+				<xsl:apply-templates mode="inline-text"/>
+				<axsl:text> </axsl:text>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:copy-of select="."/>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+<xsl:template name="process-rule">
+	<!-- Overrides skeleton: writes an svrl:fired-rule element unless $generate-fired-rule is switched off.
+	     All attributes are created with axsl:attribute, so they can follow the ones added by richParms
+	     and a '{' or '}' in $context is not read as an AVT when the generated stylesheet runs. -->
+	<xsl:param name="id"/>
+	<xsl:param name="context"/>
+	<xsl:param name="flag"/>
+	<!-- "Linkable" parameters -->
+	<xsl:param name="role"/>
+	<xsl:param name="subject"/>
+	<!-- "Rich" parameters -->
+	<xsl:param name="fpi" />
+	<xsl:param name="icon" />
+	<xsl:param name="lang" />
+	<xsl:param name="see" />
+	<xsl:param name="space" />
+	<xsl:if test=" $generate-fired-rule = 'true'">
+	<svrl:fired-rule>
+		<axsl:attribute name="context"><xsl:value-of select="$context"/></axsl:attribute>
+		<!-- Process rich attributes.  -->
+		<xsl:call-template name="richParms">
+			<xsl:with-param name="fpi" select="$fpi"/>
+			<xsl:with-param name="icon" select="$icon"/>
+			<xsl:with-param name="lang" select="$lang"/>
+			<xsl:with-param name="see" select="$see" />
+			<xsl:with-param name="space" select="$space" />
+		</xsl:call-template>
+		<xsl:if test=" string( $id )">
+			<axsl:attribute name="id"><xsl:value-of select=" $id " /></axsl:attribute>
+		</xsl:if>
+		<xsl:if test=" string-length( $role ) &gt; 0">
+			<axsl:attribute name="role"><xsl:value-of select=" $role " /></axsl:attribute>
+		</xsl:if>
+	</svrl:fired-rule>
+</xsl:if>
+</xsl:template>
+
+<xsl:template name="process-ns"> <!-- Overrides skeleton: reports a prefix/URI pair as svrl:ns-prefix-in-attribute-values -->
+	<xsl:param name="prefix"/>
+	<xsl:param name="uri"/>
+	<svrl:ns-prefix-in-attribute-values uri="{$uri}" prefix="{$prefix}" />
+</xsl:template>
+
+<xsl:template name="process-p"> 
+	<xsl:param name="icon"/>
+	<xsl:param name="class"/>
+	<xsl:param name="id"/>
+	<xsl:param name="lang"/>
+	 <!-- Overrides skeleton: none of the parameters is used; the content is wrapped in svrl:text -->
+	<svrl:text> 
+		<xsl:apply-templates mode="text"/>
+	</svrl:text>
+</xsl:template>
+
+<xsl:template name="process-pattern">
+	<xsl:param name="name"/>
+	<xsl:param name="id"/>
+	<xsl:param name="is-a"/>
+	<!-- Overrides skeleton: writes an svrl:active-pattern element; $is-a is accepted but not used here -->
+	<!-- "Rich" parameters -->
+	<xsl:param name="fpi" />
+	<xsl:param name="icon" />
+	<xsl:param name="lang" />
+	<xsl:param name="see" />
+	<xsl:param name="space" />
+	<svrl:active-pattern > 
+		<xsl:if test=" string( $id )">
+			<axsl:attribute name="id">
+				<xsl:value-of select=" $id " />
+			</axsl:attribute>
+		</xsl:if>
+		<xsl:if test=" string( $name )">
+			<axsl:attribute name="name">
+				<xsl:value-of select=" $name " />
+			</axsl:attribute>
+		</xsl:if> 
+		 <!-- id and name are only written when non-empty -->
+		<xsl:call-template name='richParms'>
+			<xsl:with-param name="fpi" select="$fpi"/>
+			<xsl:with-param name="icon" select="$icon"/>
+			<xsl:with-param name="lang" select="$lang"/>
+			<xsl:with-param name="see" select="$see" />
+			<xsl:with-param name="space" select="$space" />
+		</xsl:call-template>
+		
+		<!-- ?? report that this screws up iso:title processing  -->
+		<xsl:apply-templates mode="do-pattern-p"/>
+		<!-- ?? Seems that this apply-templates is never triggered DP -->
+		<axsl:apply-templates />
+	</svrl:active-pattern>
+</xsl:template>
+
+<!-- Overrides skeleton: accepts the same parameters but produces no output -->
+<xsl:template name="process-message" > 
+	<xsl:param name="pattern"/>
+	<xsl:param name="role"/>
+</xsl:template>
+
+
+    <!-- Overrides skeleton: how a Schematron span element is rendered -->
+	<xsl:template name="process-span" >
+		<xsl:param name="class" />
+		<xsl:choose>
+			<xsl:when test="not($allow-foreign = 'true')">
+				<!-- We generate too much whitespace rather than risking concatenation -->
+				<axsl:text> </axsl:text>
+				<xsl:apply-templates mode="inline-text"/>
+				<axsl:text> </axsl:text>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:copy-of select="."/>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+<!-- =========================================================================== -->
+<!-- Processing rich parameters: each non-empty one is written as an axsl:attribute instruction. -->
+<xsl:template name='richParms'>
+	<!-- "Rich" parameters -->
+	<xsl:param name="fpi" />
+	<xsl:param name="icon" />
+	<xsl:param name="lang" />
+	<xsl:param name="see" />
+	<xsl:param name="space" />
+	<!-- Process rich attributes: fpi, icon and see are only written when $allow-foreign is 'true'; xml:space and xml:lang are not subject to that switch -->
+	<xsl:if  test=" $allow-foreign = 'true'">
+	<xsl:if test="string($fpi)"> 
+		<axsl:attribute name="fpi">
+			<xsl:value-of select="$fpi"/>
+		</axsl:attribute>
+	</xsl:if>
+	<xsl:if test="string($icon)"> 
+		<axsl:attribute name="icon">
+			<xsl:value-of select="$icon"/>
+		</axsl:attribute>
+	</xsl:if>
+	<xsl:if test="string($see)"> 
+		<axsl:attribute name="see">
+			<xsl:value-of select="$see"/>
+		</axsl:attribute>
+	</xsl:if>
+	</xsl:if>
+	<xsl:if test="string($space)">
+		<axsl:attribute name="xml:space">
+			<xsl:value-of select="$space"/>
+		</axsl:attribute>
+	</xsl:if>
+	<xsl:if test="string($lang)">
+		<axsl:attribute name="xml:lang">
+			<xsl:value-of select="$lang"/>
+		</axsl:attribute>
+	</xsl:if>
+</xsl:template>
+
+<!-- processing linkable parameters. -->
+<xsl:template name='linkableParms'>
+	<xsl:param name="role"/>
+	<xsl:param name="subject"/>
+	<!-- Only $role leads to output here; $subject is accepted but not used in this template -->
+	<!-- ISO SVRL has a role attribute to match the Schematron role attribute -->
+	<xsl:if test=" string($role )">
+		<axsl:attribute name="role">
+			<xsl:value-of select=" $role " />
+		</axsl:attribute>
+	</xsl:if>
+	<!-- ISO SVRL does not have a subject attribute to match the Schematron subject attribute.
+       Instead, the Schematron subject attribute is folded into the location attribute -->
+</xsl:template>
+   
+
+</xsl:stylesheet>
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e5d6dfcd9e9c2787c32d98b0063a1fa1ec3236ad
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt
@@ -0,0 +1,84 @@
+ISO SCHEMATRON 2010
+
+XSLT implementation by Rick Jelliffe with assistance from members of the Schematron-love-in mailing list.
+
+2010-04-21
+
+Two distributions are available. One is for XSLT1 engines. 
+The other is for XSLT2 engines, such as SAXON 9.
+
+
+This version of Schematron splits the process into a pipeline of several different XSLT stages.
+
+1) First, preprocess your Schematron schema with iso_dsdl_include.xsl.  
+This is a macro processor to assemble the schema from various parts. 
+If your schema is not in separate parts, you can skip this stage.
+This stage also generates error messages for some common XPath syntax problems.
+
+2) Second, preprocess the output from stage 1 with iso_abstract_expand.xsl.  
+This is a macro processor to convert abstract patterns to real patterns. 
+If your schema does not use abstract patterns, you can skip this
+stage.
+
+3) Third, compile the Schematron schema into an XSLT script. 
+This will typically use iso_svrl_for_xslt1.xsl or iso_svrl_for_xslt2.xsl 
+(which in turn invoke iso_schematron_skeleton_for_xslt1.xsl or iso_schematron_skeleton_for_saxon.xsl).
+However, other "meta-stylesheets" are also in common use; the principle of operation is the same.
+If your schema uses Schematron phases, supply these as command line/invocation parameters
+to this process.
+
+4) Fourth, run the script generated by stage 3 against the document being validated.
+If you are using the SVRL script, then the output of validation will be an XML document.
+If your schema uses Schematron parameters, supply these as command line/invocation parameters
+to this process. 
+
+
+The XSLT2 distribution also features several next generation features, 
+such as validating multiple documents. See the source code for details.
+
+Schematron assertions can be written in any language, of course; the file
+sch-messages-en.xhtml contains the diagnostic messages from the XSLT2 skeleton
+in English, and this can be used as a template to localize the skeleton's
+error messages. Note that programming errors in Schematron are typically XPath
+errors, which require localized messages from the XSLT engine.
+
+ANT
+---
+To give an example of how to process a document, here is a sample ANT task.
+
+<target  name="schematron-compile-test" >
+
+	   <!-- expand inclusions -->
+	   <xslt basedir="test/schematron"
+	   		style="iso_dsdl_include.xsl" in="test.sch"  out="test1.sch"> 
+	   				<classpath>
+	   					<pathelement location="${lib.dir}/saxon9.jar"/>
+	   				</classpath>
+	   </xslt>
+
+	   <!-- expand abstract patterns -->
+	   <xslt basedir="test/schematron"
+	   		style="iso_abstract_expand.xsl" in="test1.sch"  out="test2.sch"> 
+	   				<classpath>
+	   					<pathelement location="${lib.dir}/saxon9.jar"/>
+	   				</classpath>
+	   </xslt>
+
+
+
+	   <!-- compile it -->
+	   <xslt basedir="test/schematron"
+	   		style="iso_svrl_for_xslt2.xsl" in="test2.sch"  out="test.xsl"> 
+	   				<classpath>
+	   					<pathelement location="${lib.dir}/saxon9.jar"/>
+	   				</classpath>
+	   </xslt>
+	   
+	   <!-- validate -->
+	   <xslt basedir="test/schematron"
+		   		style="test.xsl" in="instance.xml"  out="instance.svrlt"> 
+		   				<classpath>
+		   					<pathelement location="${lib.dir}/saxon9.jar"/>
+		   				</classpath>
+	</xslt>
+		</target>
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2441f8fa39ee4b43a36644da847825b8b795c232
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b35162aa894b8ef93271eeed3337fe812ed25a2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/glue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/glue.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..330872134db456f01cbda7e3c7539bf7bfa440e9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/glue.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ff620c8819707d2f7620274c1aed949d6bd4dbd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/language_modeling.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/squad.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/squad.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8362645998854a295cd55e5a0f8a86970ac38c40
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/datasets/__pycache__/squad.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebd0d17aa55bb4529820ce347f6275d38f6c0caa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__init__.py
@@ -0,0 +1,98 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+from ...utils import is_sklearn_available, requires_backends
+
+
+if is_sklearn_available():
+    from scipy.stats import pearsonr, spearmanr
+    from sklearn.metrics import f1_score, matthews_corrcoef
+
+
+DEPRECATION_WARNING = (  # text passed to warnings.warn (as a FutureWarning) by every metric function in this module
+    "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
+    "library. You can have a look at this example script for pointers: "
+    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
+)
+
+
+def simple_accuracy(preds, labels):  # deprecated helper: mean of the elementwise `preds == labels` comparison
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(simple_accuracy, "sklearn")
+    return (preds == labels).mean()  # assumes array-likes whose `==` is elementwise (e.g. NumPy) — TODO confirm
+
+
+def acc_and_f1(preds, labels):
+    """Deprecated: return accuracy, F1 and their mean as a dict.
+
+    Emits a FutureWarning and requires the "sklearn" backend. Keys of the
+    result are "acc", "f1" and "acc_and_f1".
+    """
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(acc_and_f1, "sklearn")
+    accuracy, f1_value = simple_accuracy(preds, labels), f1_score(y_true=labels, y_pred=preds)
+    return {"acc": accuracy, "f1": f1_value, "acc_and_f1": (accuracy + f1_value) / 2}
+
+
+def pearson_and_spearman(preds, labels):
+    """Deprecated: return the Pearson and Spearman correlations and their mean as a dict.
+
+    Emits a FutureWarning and requires the "sklearn" backend. Keys of the
+    result are "pearson", "spearmanr" and "corr".
+    """
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(pearson_and_spearman, "sklearn")
+    r_pearson, r_spearman = pearsonr(preds, labels)[0], spearmanr(preds, labels)[0]
+    return {"pearson": r_pearson, "spearmanr": r_spearman, "corr": (r_pearson + r_spearman) / 2}
+
+
+def glue_compute_metrics(task_name, preds, labels):
+    """Deprecated: compute the metric(s) associated with a GLUE task name.
+
+    Emits a FutureWarning, requires the "sklearn" backend and asserts that `preds` and `labels` have the same
+    length. Raises KeyError(task_name) for a task name that is not handled below.
+    """
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(glue_compute_metrics, "sklearn")
+    assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
+
+    # Tasks whose only metric is plain accuracy, reported under the "acc" key.
+    if task_name in ("sst-2", "qnli", "rte", "wnli", "hans"):
+        return {"acc": simple_accuracy(preds, labels)}
+    # The two MNLI variants report accuracy under a task-specific key.
+    if task_name == "mnli":
+        return {"mnli/acc": simple_accuracy(preds, labels)}
+    if task_name == "mnli-mm":
+        return {"mnli-mm/acc": simple_accuracy(preds, labels)}
+    # Tasks that report accuracy together with F1.
+    if task_name in ("mrpc", "qqp"):
+        return acc_and_f1(preds, labels)
+    if task_name == "cola":
+        return {"mcc": matthews_corrcoef(labels, preds)}
+    if task_name == "sts-b":
+        return pearson_and_spearman(preds, labels)
+
+    # Anything else is an unknown task.
+    raise KeyError(task_name)
+
+
+def xnli_compute_metrics(task_name, preds, labels):
+    """Deprecated: accuracy for "xnli"; ValueError on a length mismatch, KeyError for any other task name."""
+    warnings.warn(DEPRECATION_WARNING, FutureWarning)
+    requires_backends(xnli_compute_metrics, "sklearn")
+    if len(preds) != len(labels):
+        raise ValueError(f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}")
+    if task_name != "xnli":
+        raise KeyError(task_name)
+    return {"acc": simple_accuracy(preds, labels)}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..392ea7f745d95e81d4afcc21161c531199b3b4cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6628a5c73a789682cff3f8f5bf4a617cf8b617e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/__pycache__/squad_metrics.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/squad_metrics.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/squad_metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ffc025b65a0451523004df12f5a4ae5e9d17b9a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/metrics/squad_metrics.py
@@ -0,0 +1,779 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was modified by XLNet authors to
+update `find_best_threshold` scripts for SQuAD V2.0
+
+In addition to basic functionality, we also compute additional statistics and plot precision-recall curves if an
+additional na_prob.json file is provided. This file is expected to map question ID's to the model's predicted
+probability that a question is unanswerable.
+"""
+
+import collections
+import json
+import math
+import re
+import string
+
+from ...models.bert import BasicTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def normalize_answer(s):
+    """Normalize an answer string for comparison.
+
+    Steps, applied in this order: lowercase, drop every character found in `string.punctuation`,
+    replace the standalone words "a", "an" and "the" with a space, then collapse runs of
+    whitespace into single spaces and trim both ends.
+    """
+    lowered = s.lower()
+
+    # Character-level filter; only the characters listed in `string.punctuation` are removed.
+    punctuation = set(string.punctuation)
+    without_punct = "".join(ch for ch in lowered if ch not in punctuation)
+
+    # Articles become a single space; the extra whitespace is cleaned up by the final split/join.
+    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+    without_articles = re.sub(article_pattern, " ", without_punct)
+
+    return " ".join(without_articles.split())
+
+
+def get_tokens(s):
+    """Return the whitespace-separated tokens of the normalized answer; a falsy `s` gives []."""
+    # Falsy input (e.g. None or "") short-circuits, so normalize_answer is never called on it.
+    return normalize_answer(s).split() if s else []
+
+
+def compute_exact(a_gold, a_pred):  # 1 if both answers are equal after normalize_answer, else 0
+    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+
+def compute_f1(a_gold, a_pred):
+    """Token-level F1 between a gold answer and a prediction, both tokenized with get_tokens.
+    Returns int 1/0 when either side has no tokens (1 only if both are empty), int 0 when
+    no token is shared, and a float F1 otherwise.
+    """
+    gold, pred = get_tokens(a_gold), get_tokens(a_pred)
+    if not gold or not pred:
+        return int(gold == pred)
+    # Multiset intersection: a shared token counts as often as it occurs on both sides.
+    overlap = sum((collections.Counter(gold) & collections.Counter(pred)).values())
+    if overlap == 0:
+        return 0
+    precision, recall = 1.0 * overlap / len(pred), 1.0 * overlap / len(gold)
+    return (2 * precision * recall) / (precision + recall)
+
+
+def get_raw_scores(examples, preds):
+    """
+    Score every example that has a prediction against its gold answers.
+
+    Returns two dicts keyed by `qas_id`: the best exact-match score and the best F1 score over the example's
+    gold answers that are non-empty after normalization. Examples without any such answer are scored against
+    "". Examples whose `qas_id` is absent from `preds` are reported on stdout and left out of both dicts.
+    """
+    exact_scores, f1_scores = {}, {}
+
+    for example in examples:
+        qas_id = example.qas_id
+        # Gold answers that normalize to nothing are dropped; with none left, the only gold answer is "".
+        gold_answers = [ans["text"] for ans in example.answers if normalize_answer(ans["text"])] or [""]
+
+        if qas_id in preds:
+            predicted = preds[qas_id]
+            exact_scores[qas_id] = max(compute_exact(gold, predicted) for gold in gold_answers)
+            f1_scores[qas_id] = max(compute_f1(gold, predicted) for gold in gold_answers)
+        else:
+            # Examples without a prediction are only reported, never scored.
+            print(f"Missing prediction for {qas_id}")
+
+    return exact_scores, f1_scores
+
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
+    """Return a new dict of scores in which questions predicted unanswerable are re-scored.
+    A question counts as predicted unanswerable when its `na_probs` entry is strictly above `na_prob_thresh`;
+    it then scores 1.0 if it truly has no answer and 0.0 otherwise. All other scores are kept unchanged.
+    """
+    return {
+        qid: float(not qid_to_has_ans[qid]) if na_probs[qid] > na_prob_thresh else score
+        for qid, score in scores.items()
+    }
+
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+    """Aggregate per-question scores into percentages.
+
+    With a falsy `qid_list` every entry of `exact_scores` / `f1_scores` is averaged; otherwise only the listed
+    question ids are. The result is an OrderedDict with keys "exact", "f1" and "total", in that order.
+    """
+    if qid_list:
+        total = len(qid_list)
+        exact_sum = sum(exact_scores[k] for k in qid_list)
+        f1_sum = sum(f1_scores[k] for k in qid_list)
+    else:
+        total = len(exact_scores)
+        exact_sum = sum(exact_scores.values())
+        f1_sum = sum(f1_scores.values())
+
+    # No guard is applied: total == 0 raises ZeroDivisionError below.
+    return collections.OrderedDict(
+        [("exact", 100.0 * exact_sum / total), ("f1", 100.0 * f1_sum / total), ("total", total)]
+    )
+
+
+def merge_eval(main_eval, new_eval, prefix):
+    """Copy every entry of `new_eval` into `main_eval` in place, under the key f"{prefix}_{key}"."""
+    main_eval.update({f"{prefix}_{key}": value for key, value in new_eval.items()})
+
+
+def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
+    """Sweep the no-answer threshold over `na_probs` and report the best achievable score.
+
+    Returns a tuple `(best_score_percent, best_thresh, has_ans_mean)`. `has_ans_mean` is the summed score of
+    the answerable questions divided by the number of answerable questions found in `na_probs`; it is 0.0
+    when there is no such question (previously this case raised ZeroDivisionError).
+    """
+    # The running score starts at the number of questions without an answer.
+    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+    cur_score = num_no_ans
+    best_score = cur_score
+    best_thresh = 0.0
+    # Visit questions from the lowest to the highest no-answer probability.
+    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+    for qid in qid_list:
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            diff = scores[qid]
+        else:
+            diff = -1 if preds[qid] else 0
+        cur_score += diff
+        if cur_score > best_score:
+            best_score = cur_score
+            best_thresh = na_probs[qid]
+
+    # Answerable questions without an entry in `scores` still count in the denominator.
+    has_ans_qids = [qid for qid in qid_list if qid_to_has_ans[qid]]
+    has_ans_score = sum(scores[qid] for qid in has_ans_qids if qid in scores)
+    has_ans_mean = 1.0 * has_ans_score / len(has_ans_qids) if has_ans_qids else 0.0
+
+    return 100.0 * best_score / len(scores), best_thresh, has_ans_mean
+
+
+def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    """Store the find_best_thresh_v2 results for the exact and F1 scores into `main_eval` (in place)."""
+    exact_result = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
+    f1_result = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
+    # Insertion order: the four best_* keys first, then the two has_ans_* keys.
+    main_eval["best_exact"], main_eval["best_exact_thresh"], has_ans_exact = exact_result
+    main_eval["best_f1"], main_eval["best_f1_thresh"], has_ans_f1 = f1_result
+    main_eval["has_ans_exact"] = has_ans_exact
+    main_eval["has_ans_f1"] = has_ans_f1
+
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+    """Sweep the no-answer threshold over `na_probs`; return `(best_score_percent, best_thresh)`.
+
+    The running score starts at the number of questions without an answer and is updated while visiting
+    question ids in ascending `na_probs` order; ids missing from `scores` are skipped. The percentage is
+    relative to `len(scores)`.
+    """
+    running = sum(1 for qid in qid_to_has_ans if not qid_to_has_ans[qid])
+    best_score, best_thresh = running, 0.0
+    for qid in sorted(na_probs, key=lambda q: na_probs[q]):
+        if qid not in scores:
+            continue
+        if qid_to_has_ans[qid]:
+            running += scores[qid]
+        elif preds[qid]:
+            # A non-empty prediction on a question without an answer costs one point.
+            running += -1
+        else:
+            running += 0
+        if running > best_score:
+            best_score, best_thresh = running, na_probs[qid]
+    return 100.0 * best_score / len(scores), best_thresh
+
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+    """Store the find_best_thresh results for the exact and F1 scores into `main_eval` (in place)."""
+    exact_result = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
+    f1_result = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
+
+    # Each result is a (best score, threshold) pair; the exact keys are inserted before the F1 keys.
+    main_eval["best_exact"], main_eval["best_exact_thresh"] = exact_result
+    main_eval["best_f1"], main_eval["best_f1_thresh"] = f1_result
+
+
+def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
+    """Compute SQuAD-style evaluation statistics for `preds`.
+
+    The result holds overall "exact"/"f1"/"total" entries, the same entries prefixed with "HasAns_" and
+    "NoAns_" when such questions exist, and best-threshold entries when `no_answer_probs` is non-empty.
+    When `no_answer_probs` is None, every predicted question id gets a no-answer probability of 0.0.
+    """
+    has_answer_by_qid = {example.qas_id: bool(example.answers) for example in examples}
+    answerable = [qid for qid, flag in has_answer_by_qid.items() if flag]
+    unanswerable = [qid for qid, flag in has_answer_by_qid.items() if not flag]
+
+    if no_answer_probs is None:
+        no_answer_probs = dict.fromkeys(preds, 0.0)
+
+    exact, f1 = get_raw_scores(examples, preds)
+    exact_threshold, f1_threshold = (
+        apply_no_ans_threshold(raw, no_answer_probs, has_answer_by_qid, no_answer_probability_threshold)
+        for raw in (exact, f1)
+    )
+
+    evaluation = make_eval_dict(exact_threshold, f1_threshold)
+    for prefix, qids in (("HasAns", answerable), ("NoAns", unanswerable)):
+        if qids:
+            merge_eval(evaluation, make_eval_dict(exact_threshold, f1_threshold, qid_list=qids), prefix)
+
+    if no_answer_probs:
+        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, has_answer_by_qid)
+
+    return evaluation
+
+
+def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
+    """Project the tokenized prediction back to the original text; returns `orig_text` whenever alignment fails."""
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):  # returns (text without " " characters, map: index in stripped text -> index in text)
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for i, c in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, then strip spaces from both `orig_text`
+    # and its tokenized form, and check if the results are the same length.
+    # If they are NOT the same length, the heuristic has failed. If they are
+    # the same length, we assume the characters are one-to-one aligned.
+    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose_logging:
+            logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'")
+        return orig_text
+    end_position = start_position + len(pred_text) - 1  # inclusive index of the last matched character in tok_text
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose_logging:
+            logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'")
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}  # inverse map: index in tok_text -> index in space-stripped tok_text
+    for i, tok_index in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position : (orig_end_position + 1)]  # slice end is exclusive, hence + 1
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Return the positions of the `n_best_size` largest logits, best first.
+
+    Ties keep their original relative order (the sort is stable). At most `len(logits)` positions
+    are returned, and none when `n_best_size` is zero or negative.
+    """
+    ranked = sorted(enumerate(logits), key=lambda pair: pair[1], reverse=True)
+    # Filtering on the rank (instead of slicing) keeps a zero or negative `n_best_size`
+    # yielding an empty list.
+    return [index for rank, (index, _) in enumerate(ranked) if rank < n_best_size]
+
+
+def _compute_softmax(scores):
+    """Turn a sequence of raw logits into softmax probabilities (a list of floats).
+
+    An empty `scores` gives an empty list. The largest logit is subtracted before exponentiation,
+    so every exponent is <= 0 and `math.exp` cannot overflow; mathematically the probabilities
+    are unaffected by that shift.
+    """
+    if not scores:
+        return []
+
+    # Largest logit, used as the shift below.
+    peak = max(scores)
+    exp_scores = [math.exp(score - peak) for score in scores]
+
+    # Accumulate left to right with plain `+=`: the builtin `sum` may use a different
+    # (compensated) float summation and could change the low-order bits of the result.
+    total_sum = 0.0
+    for value in exp_scores:
+        total_sum += value
+
+    return [value / total_sum for value in exp_scores]
+
+
+def compute_predictions_logits(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    do_lower_case,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    verbose_logging,
+    version_2_with_negative,
+    null_score_diff_threshold,
+    tokenizer,
+):
+    """Write final predictions to the json file and log-odds of null if needed."""
+    if output_prediction_file:
+        logger.info(f"Writing predictions to: {output_prediction_file}")
+    if output_nbest_file:
+        logger.info(f"Writing nbest to: {output_nbest_file}")
+    if output_null_log_odds_file and version_2_with_negative:
+        logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}")
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
+    )
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for example_index, example in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min null score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for feature_index, feature in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if version_2_with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index],
+                        )
+                    )
+        if version_2_with_negative:
+            prelim_predictions.append(
+                _PrelimPrediction(
+                    feature_index=min_null_feature_index,
+                    start_index=0,
+                    end_index=0,
+                    start_logit=null_start_logit,
+                    end_logit=null_end_logit,
+                )
+            )
+        prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
+
+        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+            "NbestPrediction", ["text", "start_logit", "end_logit"]
+        )
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+            if pred.start_index > 0:  # this is a non-null prediction
+                tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
+
+                tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+                # tok_text = " ".join(tok_tokens)
+                #
+                # # De-tokenize WordPieces that have been split off.
+                # tok_text = tok_text.replace(" ##", "")
+                # tok_text = tok_text.replace("##", "")
+
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                orig_text = " ".join(orig_tokens)
+
+                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
+                if final_text in seen_predictions:
+                    continue
+
+                seen_predictions[final_text] = True
+            else:
+                final_text = ""
+                seen_predictions[final_text] = True
+
+            nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
+        # if we didn't include the empty option in the n-best, include it
+        if version_2_with_negative:
+            if "" not in seen_predictions:
+                nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
+
+            # In very rare edge cases we could only have single null prediction.
+            # So we just create a nonce prediction in this case to avoid failure.
+            if len(nbest) == 1:
+                nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        if len(nbest) < 1:
+            raise ValueError("No valid predictions")
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+            if not best_non_null_entry:
+                if entry.text:
+                    best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for i, entry in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+
+        if len(nbest_json) < 1:
+            raise ValueError("No valid predictions")
+
+        if not version_2_with_negative:
+            all_predictions[example.qas_id] = nbest_json[0]["text"]
+        else:
+            # predict "" iff the null score - the score of best non-null > threshold
+            score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
+            scores_diff_json[example.qas_id] = score_diff
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example.qas_id] = ""
+            else:
+                all_predictions[example.qas_id] = best_non_null_entry.text
+        all_nbest_json[example.qas_id] = nbest_json
+
+    if output_prediction_file:
+        with open(output_prediction_file, "w") as writer:
+            writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    if output_nbest_file:
+        with open(output_nbest_file, "w") as writer:
+            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if output_null_log_odds_file and version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
+
+
+def compute_predictions_log_probs(
+    all_examples,
+    all_features,
+    all_results,
+    n_best_size,
+    max_answer_length,
+    output_prediction_file,
+    output_nbest_file,
+    output_null_log_odds_file,
+    start_n_top,
+    end_n_top,
+    version_2_with_negative,
+    tokenizer,
+    verbose_logging,
+):
+    """
+    XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of
+    null if needed.
+
+    Candidate spans are the `start_n_top` x `end_n_top` pairs stored in every feature's result. They are filtered,
+    ranked by `start_log_prob + end_log_prob`, de-duplicated on their final text and cut to `n_best_size`. The first
+    surviving entry is the prediction; the minimum `cls_logits` over the features is stored as the null score.
+
+    Each output file is written only when its path is truthy (the null log-odds file also needs
+    `version_2_with_negative`), matching `compute_predictions_logits`.
+
+    Requires utils_squad_evaluate.py
+
+    Returns:
+        `collections.OrderedDict`: maps each `example.qas_id` to its predicted answer text.
+    """
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
+    )
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
+    )
+
+    logger.info(f"Writing predictions to: {output_prediction_file}")
+
+    # Group the features by the example they belong to.
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {result.unique_id: result for result in all_results}
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for example_index, example in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+
+        for feature_index, feature in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+
+            # if we could have irrelevant answers, get the min score of irrelevant
+            score_null = min(score_null, result.cls_logits)
+
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_log_prob = result.start_logits[i]
+                    start_index = result.start_top_index[i]
+                    # End candidates are indexed flat: `end_n_top` consecutive entries per start candidate.
+                    j_index = i * end_n_top + j
+                    end_log_prob = result.end_logits[j_index]
+                    end_index = result.end_top_index[j_index]
+
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= feature.paragraph_len - 1:
+                        continue
+                    if end_index >= feature.paragraph_len - 1:
+                        continue
+
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_log_prob=start_log_prob,
+                            end_log_prob=end_log_prob,
+                        )
+                    )
+
+        prelim_predictions = sorted(
+            prelim_predictions, key=lambda x: (x.start_log_prob + x.end_log_prob), reverse=True
+        )
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+
+            # XLNet un-tokenizer: an offset-based variant (tok_start_to_orig_index / tok_end_to_orig_index into
+            # example.paragraph_text) is not used here. Let's keep it simple for now and see if we need it later.
+
+            # Previously used Bert untokenizer
+            tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)]
+            orig_doc_start = feature.token_to_orig_map[pred.start_index]
+            orig_doc_end = feature.token_to_orig_map[pred.end_index]
+            orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)]
+            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+            # Clean whitespace
+            tok_text = tok_text.strip()
+            tok_text = " ".join(tok_text.split())
+            orig_text = " ".join(orig_tokens)
+
+            # The casing flag is read from `do_lower_case` when present, else `do_lowercase_and_remove_accent`.
+            if hasattr(tokenizer, "do_lower_case"):
+                do_lower_case = tokenizer.do_lower_case
+            else:
+                do_lower_case = tokenizer.do_lowercase_and_remove_accent
+
+            final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
+
+            # Keep only the highest-ranked prediction for each distinct answer text.
+            if final_text in seen_predictions:
+                continue
+            seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(text=final_text, start_log_prob=pred.start_log_prob, end_log_prob=pred.end_log_prob)
+            )
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(_NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_log_prob + entry.end_log_prob)
+            if not best_non_null_entry:
+                best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for i, entry in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_log_prob"] = entry.start_log_prob
+            output["end_log_prob"] = entry.end_log_prob
+            nbest_json.append(output)
+
+        if len(nbest_json) < 1:
+            raise ValueError("No valid predictions")
+        if best_non_null_entry is None:
+            raise ValueError("No valid predictions")
+
+        scores_diff_json[example.qas_id] = score_null
+        # note(zhiliny): always predict best_non_null_entry
+        # and the evaluation script will search for the best threshold
+        all_predictions[example.qas_id] = best_non_null_entry.text
+        all_nbest_json[example.qas_id] = nbest_json
+
+    # Skip any output whose path was not provided instead of failing inside open().
+    if output_prediction_file:
+        with open(output_prediction_file, "w") as writer:
+            writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    if output_nbest_file:
+        with open(output_nbest_file, "w") as writer:
+            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if output_null_log_odds_file and version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a26ab5776d74715428b10c4d9cd943e53b253785
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__init__.py
@@ -0,0 +1,18 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
+from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
+from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
+from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68e97a656acfe188f942bb93470e803ea9270796
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/glue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/glue.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4ed7f71bbd9d4bb468883d751fa8f8fb6c85a2f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/glue.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/squad.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/squad.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0aeb1d0d8d25ed1ceda3ebfab617afd3ecd460ed
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/squad.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5b682fee961c0dbb7513218015ffe027f9e0d64
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/xnli.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/xnli.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b5c36e536c48b4fe62a9e3798468f5ba8000ffe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/__pycache__/xnli.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/glue.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/glue.py
new file mode 100644
index 0000000000000000000000000000000000000000..e005c9bcda13d15bc3aa32a50c79941166d0ba28
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/glue.py
@@ -0,0 +1,643 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GLUE processors and helpers"""
+
+import os
+import warnings
+from dataclasses import asdict
+from enum import Enum
+from typing import Optional, Union
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import is_tf_available, logging
+from .utils import DataProcessor, InputExample, InputFeatures
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.get_logger(__name__)
+# FutureWarning text shared by this module; callers fill `{0}` with "function" or "processor" via `.format`.
+DEPRECATION_WARNING = (
+    "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
+    "library. You can have a look at this example script for pointers: "
+    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
+)
+
+
+def glue_convert_examples_to_features(
+    examples: Union[list[InputExample], "tf.data.Dataset"],
+    tokenizer: PreTrainedTokenizer,
+    max_length: Optional[int] = None,
+    task=None,
+    label_list=None,
+    output_mode=None,
+):
+    """
+    Convert GLUE examples into model-ready features. Deprecated: every call emits a `FutureWarning`.
+
+    Args:
+        examples: Either a list of `InputExample` or a `tf.data.Dataset` holding the examples.
+        tokenizer: Tokenizer instance used to encode the examples.
+        max_length: Maximum example length, passed through to the conversion helper.
+        task: GLUE task name. A `ValueError` is raised when it is None and `examples` is a `tf.data.Dataset`.
+        label_list: List of labels; only used for list input.
+        output_mode: Output mode string (`regression` or `classification`); only used for list input.
+
+    Returns:
+        A `tf.data.Dataset` of task-specific features when `examples` is a `tf.data.Dataset` (and TensorFlow is
+        available), otherwise a list of task-specific `InputFeatures`.
+    """
+    warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
+    # The dataset branch applies only when TensorFlow is installed and a dataset was actually passed.
+    dataset_input = is_tf_available() and isinstance(examples, tf.data.Dataset)
+    if not dataset_input:
+        return _glue_convert_examples_to_features(
+            examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
+        )
+    if task is None:
+        raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
+    return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
+
+
+if is_tf_available():
+
+    def _tf_glue_convert_examples_to_features(
+        examples: tf.data.Dataset,
+        tokenizer: PreTrainedTokenizer,
+        task: Optional[str] = None,
+        max_length: Optional[int] = None,
+    ) -> tf.data.Dataset:
+        """
+        Convert a `tf.data.Dataset` of GLUE examples for `task` into a dataset of `(inputs, label)` pairs.
+
+        `task` indexes `glue_processors`; its None default only replaces the former accidental `str` default.
+        """
+        processor = glue_processors[task]()
+        examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
+        features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
+        label_type = tf.float32 if task == "sts-b" else tf.int64
+
+        def gen():
+            # Drop unset (None) fields, then split the label off from the model inputs.
+            for ex in features:
+                d = {k: v for k, v in asdict(ex).items() if v is not None}
+                label = d.pop("label")
+                yield (d, label)
+
+        input_names = tokenizer.model_input_names
+        return tf.data.Dataset.from_generator(
+            gen,
+            (dict.fromkeys(input_names, tf.int32), label_type),
+            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
+        )
+
+
+def _glue_convert_examples_to_features(
+    examples: list[InputExample],
+    tokenizer: PreTrainedTokenizer,
+    max_length: Optional[int] = None,
+    task=None,
+    label_list=None,
+    output_mode=None,
+):
+    """
+    Tokenize `examples` as `(text_a, text_b)` pairs and build one `InputFeatures` per example, in input order.
+
+    `task`, when given, supplies a missing `label_list` / `output_mode`. Raises `ValueError` when classification is
+    requested without any label list.
+    """
+    if max_length is None:
+        max_length = tokenizer.model_max_length
+
+    if task is not None:
+        processor = glue_processors[task]()
+        if label_list is None:
+            label_list = processor.get_labels()
+            logger.info(f"Using label list {label_list} for task {task}")
+        if output_mode is None:
+            output_mode = glue_output_modes[task]
+            logger.info(f"Using output mode {output_mode} for task {task}")
+
+    # Regression never looks labels up, so only classification insists on a label list.
+    if label_list is None and output_mode == "classification":
+        raise ValueError("A label_list (or a task to derive it from) is required for classification.")
+    label_map = {} if label_list is None else {label: i for i, label in enumerate(label_list)}
+
+    def label_from_example(example: InputExample) -> Union[int, float, None]:
+        if example.label is None:
+            return None
+        if output_mode == "classification":
+            return label_map[example.label]
+        elif output_mode == "regression":
+            return float(example.label)
+        raise KeyError(output_mode)
+
+    labels = [label_from_example(example) for example in examples]
+    pairs = [(example.text_a, example.text_b) for example in examples]
+    batch_encoding = tokenizer(pairs, max_length=max_length, padding="max_length", truncation=True)
+    features = [
+        InputFeatures(**{k: batch_encoding[k][i] for k in batch_encoding}, label=labels[i])
+        for i in range(len(examples))
+    ]
+
+    for i, example in enumerate(examples[:5]):
+        logger.info("*** Example ***")
+        logger.info(f"guid: {example.guid}")
+        logger.info(f"features: {features[i]}")
+    return features
+
+
+class OutputMode(Enum):
+    # The two output modes declared here; each member's value is the same string as its name.
+    classification, regression = "classification", "regression"
+
+
+class MrpcProcessor(DataProcessor):
+    """GLUE MRPC processor: builds sentence-pair examples from `train.tsv`, `dev.tsv` and `test.tsv`."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` from the `idx`, `sentence1`, `sentence2` and `label` entries."""
+        idx = tensor_dict["idx"].numpy()
+        first = tensor_dict["sentence1"].numpy().decode("utf-8")
+        second = tensor_dict["sentence2"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(idx, first, second, label)
+
+    def get_train_examples(self, data_dir):
+        """Log the path of `train.tsv` under `data_dir`, then load it as the "train" split."""
+        logger.info(f"LOOKING AT {os.path.join(data_dir, 'train.tsv')}")
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Load `dev.tsv` under `data_dir` as the "dev" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        return self._create_examples(rows, "dev")
+
+    def get_test_examples(self, data_dir):
+        """Load `test.tsv` under `data_dir` as the "test" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        return self._create_examples(rows, "test")
+
+    def get_labels(self):
+        """Return the two label strings used by this task."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Turn every row after the first into an `InputExample`; the "test" split gets `label=None`."""
+        is_test = set_type == "test"
+        return [
+            InputExample(
+                guid=f"{set_type}-{i}", text_a=row[3], text_b=row[4], label=None if is_test else row[0]
+            )
+            for i, row in enumerate(lines)
+            if i != 0
+        ]
+
+
+class MnliProcessor(DataProcessor):
+    """GLUE MultiNLI processor: builds premise/hypothesis examples from the train and matched dev/test files."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` from the `idx`, `premise`, `hypothesis` and `label` entries."""
+        idx = tensor_dict["idx"].numpy()
+        premise = tensor_dict["premise"].numpy().decode("utf-8")
+        hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(idx, premise, hypothesis, label)
+
+    def get_train_examples(self, data_dir):
+        """Load `train.tsv` under `data_dir` as the "train" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Load `dev_matched.tsv` under `data_dir` as the "dev_matched" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev_matched.tsv"))
+        return self._create_examples(rows, "dev_matched")
+
+    def get_test_examples(self, data_dir):
+        """Load `test_matched.tsv` under `data_dir` as the "test_matched" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "test_matched.tsv"))
+        return self._create_examples(rows, "test_matched")
+
+    def get_labels(self):
+        """Return the three label strings used by this task."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Turn every row after the first into an `InputExample`; splits named "test..." get `label=None`."""
+        is_test = set_type.startswith("test")
+        return [
+            InputExample(
+                guid=f"{set_type}-{row[0]}", text_a=row[8], text_b=row[9], label=None if is_test else row[-1]
+            )
+            for i, row in enumerate(lines)
+            if i != 0
+        ]
+
+
+class MnliMismatchedProcessor(MnliProcessor):
+    """
+    Processor for the MultiNLI Mismatched data set (GLUE version).
+
+    Construction is inherited from `MnliProcessor`, which already emits the deprecation warning once.
+    """
+
+    def get_dev_examples(self, data_dir):
+        """Load `dev_mismatched.tsv` under `data_dir` as the "dev_mismatched" split."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")
+
+    def get_test_examples(self, data_dir):
+        """Load `test_mismatched.tsv` under `data_dir` as the "test_mismatched" split."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched")
+
+
+class ColaProcessor(DataProcessor):
+    """GLUE CoLA processor: builds single-sentence examples from `train.tsv`, `dev.tsv` and `test.tsv`."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` (no second text) from the `idx`, `sentence` and `label` entries."""
+        idx = tensor_dict["idx"].numpy()
+        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(idx, sentence, None, label)
+
+    def get_train_examples(self, data_dir):
+        """Load `train.tsv` under `data_dir` as the "train" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Load `dev.tsv` under `data_dir` as the "dev" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        return self._create_examples(rows, "dev")
+
+    def get_test_examples(self, data_dir):
+        """Load `test.tsv` under `data_dir` as the "test" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        return self._create_examples(rows, "test")
+
+    def get_labels(self):
+        """Return the two label strings used by this task."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Turn rows into single-sentence `InputExample`s; the "test" split gets `label=None`."""
+        test_mode = set_type == "test"
+        # The first row is discarded for the test split only; other splits use every row.
+        rows = lines[1:] if test_mode else lines
+        text_index = 1 if test_mode else 3
+        return [
+            InputExample(
+                guid=f"{set_type}-{i}", text_a=row[text_index], text_b=None, label=None if test_mode else row[1]
+            )
+            for i, row in enumerate(rows)
+        ]
+
+
+class Sst2Processor(DataProcessor):
+    """GLUE SST-2 processor: builds single-sentence examples from `train.tsv`, `dev.tsv` and `test.tsv`."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` (no second text) from the `idx`, `sentence` and `label` entries."""
+        idx = tensor_dict["idx"].numpy()
+        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(idx, sentence, None, label)
+
+    def get_train_examples(self, data_dir):
+        """Load `train.tsv` under `data_dir` as the "train" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Load `dev.tsv` under `data_dir` as the "dev" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        return self._create_examples(rows, "dev")
+
+    def get_test_examples(self, data_dir):
+        """Load `test.tsv` under `data_dir` as the "test" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        return self._create_examples(rows, "test")
+
+    def get_labels(self):
+        """Return the two label strings used by this task."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Turn every row after the first into a single-sentence `InputExample`; "test" gets `label=None`."""
+        is_test = set_type == "test"
+        text_index = 1 if is_test else 0
+        return [
+            InputExample(
+                guid=f"{set_type}-{i}", text_a=row[text_index], text_b=None, label=None if is_test else row[1]
+            )
+            for i, row in enumerate(lines)
+            if i != 0
+        ]
+
+
+class StsbProcessor(DataProcessor):
+    """GLUE STS-B processor: builds sentence-pair examples from `train.tsv`, `dev.tsv` and `test.tsv`."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` from the `idx`, `sentence1`, `sentence2` and `label` entries."""
+        idx = tensor_dict["idx"].numpy()
+        first = tensor_dict["sentence1"].numpy().decode("utf-8")
+        second = tensor_dict["sentence2"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(idx, first, second, label)
+
+    def get_train_examples(self, data_dir):
+        """Load `train.tsv` under `data_dir` as the "train" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Load `dev.tsv` under `data_dir` as the "dev" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        return self._create_examples(rows, "dev")
+
+    def get_test_examples(self, data_dir):
+        """Load `test.tsv` under `data_dir` as the "test" split."""
+        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        return self._create_examples(rows, "test")
+
+    def get_labels(self):
+        """Return the label list for this task, which holds the single entry None."""
+        return [None]
+
+    def _create_examples(self, lines, set_type):
+        """Turn every row after the first into an `InputExample`; the "test" split gets `label=None`."""
+        is_test = set_type == "test"
+        return [
+            InputExample(
+                guid=f"{set_type}-{row[0]}", text_a=row[7], text_b=row[8], label=None if is_test else row[-1]
+            )
+            for i, row in enumerate(lines)
+            if i != 0
+        ]
+
+
+class QqpProcessor(DataProcessor):
+    """Processor for the QQP data set (GLUE version); builds question-pair examples from the split TSV files."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` from the `idx`, `question1`, `question2` and `label` entries."""
+        idx = tensor_dict["idx"].numpy()
+        first = tensor_dict["question1"].numpy().decode("utf-8")
+        second = tensor_dict["question2"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(idx, first, second, label)
+
+    def get_train_examples(self, data_dir):
+        """Load `train.tsv` under `data_dir` as the "train" split."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """Load `dev.tsv` under `data_dir` as the "dev" split."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """Load `test.tsv` under `data_dir` as the "test" split."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
+
+    def get_labels(self):
+        """Return the two label strings used by this task."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training, dev and test sets, skipping the header and rows that are too short."""
+        test_mode = set_type == "test"
+        q1_index = 1 if test_mode else 3
+        q2_index = 2 if test_mode else 4
+        examples = []
+        for i, line in enumerate(lines):
+            if i == 0:
+                continue
+            # Best-effort parsing: any row lacking a needed column (including an empty row) is skipped.
+            try:
+                guid = f"{set_type}-{line[0]}"
+                text_a = line[q1_index]
+                text_b = line[q2_index]
+                label = None if test_mode else line[5]
+            except IndexError:
+                continue
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class QnliProcessor(DataProcessor):
+    """Processor for the QNLI data set (GLUE version): builds `InputExample` objects from its TSV files."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` from the "idx", "question", "sentence" and "label" entries."""
+        guid = tensor_dict["idx"].numpy()
+        question = tensor_dict["question"].numpy().decode("utf-8")
+        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(guid, question, sentence, label)
+
+    def get_train_examples(self, data_dir):
+        """Examples built from `train.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Examples built from `dev.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        return self._create_examples(rows, "dev")
+
+    def get_test_examples(self, data_dir):
+        """Examples built from `test.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        return self._create_examples(rows, "test")
+
+    def get_labels(self):
+        """Return the two label strings used by this task."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Turn every row after the first into an `InputExample`; the "test" split gets `label=None`."""
+        is_test = set_type == "test"
+        examples = []
+        for row_number, row in enumerate(lines):
+            if row_number > 0:
+                guid, text_a, text_b = f"{set_type}-{row[0]}", row[1], row[2]
+                label = None if is_test else row[-1]
+                examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class RteProcessor(DataProcessor):
+    """Processor for the RTE data set (GLUE version): builds `InputExample` objects from its TSV files."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` from the "idx", "sentence1", "sentence2" and "label" entries."""
+        guid = tensor_dict["idx"].numpy()
+        first = tensor_dict["sentence1"].numpy().decode("utf-8")
+        second = tensor_dict["sentence2"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(guid, first, second, label)
+
+    def get_train_examples(self, data_dir):
+        """Examples built from `train.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Examples built from `dev.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        return self._create_examples(rows, "dev")
+
+    def get_test_examples(self, data_dir):
+        """Examples built from `test.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        return self._create_examples(rows, "test")
+
+    def get_labels(self):
+        """Return the two label strings used by this task."""
+        return ["entailment", "not_entailment"]
+
+    def _create_examples(self, lines, set_type):
+        """Turn every row after the first into an `InputExample`; the "test" split gets `label=None`."""
+        is_test = set_type == "test"
+        examples = []
+        for row_number, row in enumerate(lines):
+            if row_number > 0:
+                guid, text_a, text_b = f"{set_type}-{row[0]}", row[1], row[2]
+                label = None if is_test else row[-1]
+                examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class WnliProcessor(DataProcessor):
+    """Processor for the WNLI data set (GLUE version): builds `InputExample` objects from its TSV files."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """Build an `InputExample` from the "idx", "sentence1", "sentence2" and "label" entries."""
+        guid = tensor_dict["idx"].numpy()
+        first = tensor_dict["sentence1"].numpy().decode("utf-8")
+        second = tensor_dict["sentence2"].numpy().decode("utf-8")
+        label = str(tensor_dict["label"].numpy())
+        return InputExample(guid, first, second, label)
+
+    def get_train_examples(self, data_dir):
+        """Examples built from `train.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+        return self._create_examples(rows, "train")
+
+    def get_dev_examples(self, data_dir):
+        """Examples built from `dev.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        return self._create_examples(rows, "dev")
+
+    def get_test_examples(self, data_dir):
+        """Examples built from `test.tsv` inside `data_dir`."""
+        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        return self._create_examples(rows, "test")
+
+    def get_labels(self):
+        """Return the two label strings used by this task."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Turn every row after the first into an `InputExample`; the "test" split gets `label=None`."""
+        is_test = set_type == "test"
+        examples = []
+        for row_number, row in enumerate(lines):
+            if row_number > 0:
+                guid, text_a, text_b = f"{set_type}-{row[0]}", row[1], row[2]
+                label = None if is_test else row[-1]
+                examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+glue_tasks_num_labels = {  # NOTE(review): no "mnli-mm" key here, unlike the two mappings below - confirm intended
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,  # the only task mapped to "regression" in glue_output_modes
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
+
+glue_processors = {  # task name -> processor class
+    "cola": ColaProcessor,
+    "mnli": MnliProcessor,
+    "mnli-mm": MnliMismatchedProcessor,
+    "mrpc": MrpcProcessor,
+    "sst-2": Sst2Processor,
+    "sts-b": StsbProcessor,
+    "qqp": QqpProcessor,
+    "qnli": QnliProcessor,
+    "rte": RteProcessor,
+    "wnli": WnliProcessor,
+}
+
+glue_output_modes = {  # task name -> "classification" or "regression"
+    "cola": "classification",
+    "mnli": "classification",
+    "mnli-mm": "classification",
+    "mrpc": "classification",
+    "sst-2": "classification",
+    "sts-b": "regression",
+    "qqp": "classification",
+    "qnli": "classification",
+    "rte": "classification",
+    "wnli": "classification",
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/squad.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/squad.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f37eb01813308a0c850e55ac283f13ccf231f68
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/squad.py
@@ -0,0 +1,845 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from functools import partial
+from multiprocessing import Pool, cpu_count
+from multiprocessing.pool import ThreadPool
+from typing import Optional
+
+import numpy as np
+from tqdm import tqdm
+
+from ...models.bert.tokenization_bert import whitespace_tokenize
+from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy
+from ...utils import is_tf_available, is_torch_available, is_torch_hpu_available, logging
+from .utils import DataProcessor
+
+
+# Tokenizer types that insert two separator tokens between the two sequences of a pair
+MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"}
+
+
+if is_torch_available():
+    import torch
+    from torch.utils.data import TensorDataset
+
+if is_tf_available():
+    import tensorflow as tf
+
+logger = logging.get_logger(__name__)
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
+    """Narrow [input_start, input_end] to a sub-span whose joined tokens equal the tokenized answer, if one exists."""
+    target = " ".join(tokenizer.tokenize(orig_answer_text))
+    # Candidates are tried from the leftmost start, and for each start from the widest end inwards.
+    for lo in range(input_start, input_end + 1):
+        hi = input_end
+        while hi >= lo:
+            if " ".join(doc_tokens[lo : hi + 1]) == target:
+                return (lo, hi)
+            hi -= 1
+    return (input_start, input_end)
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Return whether span `cur_span_index` is the highest-scoring span among those that contain `position`."""
+    # Spans are read through their `.start` and `.length` attributes.
+    winner = None
+    top_score = None
+    for idx, span in enumerate(doc_spans):
+        last = span.start + span.length - 1
+        if position < span.start or position > last:
+            continue
+        # Score: the smaller of the left/right context sizes, plus 0.01 per token of span length.
+        left_room = position - span.start
+        right_room = last - position
+        candidate = min(left_room, right_room) + 0.01 * span.length
+        # Strict ">" means the earliest span wins a tie.
+        if top_score is None or candidate > top_score:
+            top_score, winner = candidate, idx
+
+    return cur_span_index == winner
+
+
+def _new_check_is_max_context(doc_spans, cur_span_index, position):
+    """Return whether span `cur_span_index` is the highest-scoring span among those that contain `position`.
+
+    Spans are read through their "start" and "length" keys.
+    """
+    # (score, span index) for every span that covers the position.
+    scored = []
+    for idx, span in enumerate(doc_spans):
+        first = span["start"]
+        last = first + span["length"] - 1
+        if position < first or position > last:
+            continue
+        # Score: the smaller of the left/right context sizes, plus 0.01 per token of span length.
+        left_room = position - first
+        right_room = last - position
+        scored.append((min(left_room, right_room) + 0.01 * span["length"], idx))
+
+    # max() returns the first of several equal scores, so the earliest span wins a tie.
+    winner = max(scored, key=lambda pair: pair[0])[1] if scored else None
+    return cur_span_index == winner
+
+
+def _is_whitespace(c):
+    """Return True for a space, tab, carriage return, newline, or the character with code point 0x202F."""
+    plain_whitespace = c in (" ", "\t", "\r", "\n")
+    return plain_whitespace or ord(c) == 0x202F
+
+
+def squad_convert_example_to_features(
+    example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training
+):
+    """
+    Split one example into overlapping document spans and build a `SquadFeatures` object for each span.
+
+    Reads the module-level `tokenizer` global assigned by `squad_convert_example_to_features_init`. Returns a list
+    of `SquadFeatures` (empty when training and the answer text cannot be located in the document tokens), with
+    `example_index` and `unique_id` left at 0 for the caller to fill in.
+    """
+    features = []
+    if is_training and not example.is_impossible:
+        # Get start and end position
+        start_position = example.start_position
+        end_position = example.end_position
+
+        # If the answer cannot be found in the text, then skip this example.
+        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
+        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
+        if actual_text.find(cleaned_answer_text) == -1:
+            logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
+            return []
+
+    tok_to_orig_index = []
+    orig_to_tok_index = []
+    all_doc_tokens = []
+    for i, token in enumerate(example.doc_tokens):
+        orig_to_tok_index.append(len(all_doc_tokens))
+        if tokenizer.__class__.__name__ in [
+            "RobertaTokenizer",
+            "LongformerTokenizer",
+            "BartTokenizer",
+            "RobertaTokenizerFast",
+            "LongformerTokenizerFast",
+            "BartTokenizerFast",
+        ]:
+            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
+        else:
+            sub_tokens = tokenizer.tokenize(token)
+        for sub_token in sub_tokens:
+            tok_to_orig_index.append(i)
+            all_doc_tokens.append(sub_token)
+
+    if is_training and not example.is_impossible:
+        tok_start_position = orig_to_tok_index[example.start_position]
+        if example.end_position < len(example.doc_tokens) - 1:
+            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+        else:
+            tok_end_position = len(all_doc_tokens) - 1
+
+        (tok_start_position, tok_end_position) = _improve_answer_span(
+            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
+        )
+
+    spans = []
+
+    truncated_query = tokenizer.encode(
+        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
+    )
+
+    # Tokenizers who insert 2 SEP tokens in-between <context> & <question> need to have special handling
+    # in the way they compute mask of added tokens.
+    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
+    sequence_added_tokens = (
+        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
+        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
+        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
+    )
+    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair
+
+    span_doc_tokens = all_doc_tokens
+    while len(spans) * doc_stride < len(all_doc_tokens):
+        # Define the side we want to truncate / pad and the text/pair sorting
+        if tokenizer.padding_side == "right":
+            texts = truncated_query
+            pairs = span_doc_tokens
+            truncation = TruncationStrategy.ONLY_SECOND.value
+        else:
+            texts = span_doc_tokens
+            pairs = truncated_query
+            truncation = TruncationStrategy.ONLY_FIRST.value
+
+        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
+            texts,
+            pairs,
+            truncation=truncation,
+            padding=padding_strategy,
+            max_length=max_seq_length,
+            return_overflowing_tokens=True,
+            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
+            return_token_type_ids=True,
+        )
+
+        paragraph_len = min(
+            len(all_doc_tokens) - len(spans) * doc_stride,
+            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
+        )
+
+        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
+            if tokenizer.padding_side == "right":
+                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
+            else:
+                last_padding_id_position = (
+                    len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
+                )
+                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]
+
+        else:
+            non_padded_ids = encoded_dict["input_ids"]
+
+        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
+
+        token_to_orig_map = {}
+        for i in range(paragraph_len):
+            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
+            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
+
+        encoded_dict["paragraph_len"] = paragraph_len
+        encoded_dict["tokens"] = tokens
+        encoded_dict["token_to_orig_map"] = token_to_orig_map
+        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
+        encoded_dict["token_is_max_context"] = {}
+        encoded_dict["start"] = len(spans) * doc_stride
+        encoded_dict["length"] = paragraph_len
+
+        spans.append(encoded_dict)
+
+        # Stop once the encoder reports nothing left over; otherwise the leftover tokens feed the next span.
+        if "overflowing_tokens" not in encoded_dict or len(encoded_dict["overflowing_tokens"]) == 0:
+            break
+        span_doc_tokens = encoded_dict["overflowing_tokens"]
+
+    for doc_span_index in range(len(spans)):
+        for j in range(spans[doc_span_index]["paragraph_len"]):
+            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
+            index = (
+                j
+                if tokenizer.padding_side == "left"
+                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+            )
+            spans[doc_span_index]["token_is_max_context"][index] = is_max_context
+
+    for span in spans:
+        # Identify the position of the CLS token
+        cls_index = span["input_ids"].index(tokenizer.cls_token_id)
+
+        # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
+        # Original TF implementation also keep the classification token (set to 0)
+        p_mask = np.ones_like(span["token_type_ids"])
+        if tokenizer.padding_side == "right":
+            p_mask[len(truncated_query) + sequence_added_tokens :] = 0
+        else:
+            p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0
+        # Compare element-wise: `list == int` is just False, which would never locate any padding position.
+        pad_token_indices = np.where(np.asarray(span["input_ids"]) == tokenizer.pad_token_id)
+        special_token_indices = np.asarray(
+            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
+        ).nonzero()
+
+        p_mask[pad_token_indices] = 1
+        p_mask[special_token_indices] = 1
+
+        # Set the cls index to 0: the CLS index can be used for impossible answers
+        p_mask[cls_index] = 0
+
+        span_is_impossible = example.is_impossible
+        start_position = 0
+        end_position = 0
+        if is_training and not span_is_impossible:
+            # For training, if our document chunk does not contain an annotation
+            # we throw it out, since there is nothing to predict.
+            doc_start = span["start"]
+            doc_end = span["start"] + span["length"] - 1
+            if tok_start_position < doc_start or tok_end_position > doc_end:
+                start_position = cls_index
+                end_position = cls_index
+                span_is_impossible = True
+            else:
+                if tokenizer.padding_side == "left":
+                    doc_offset = 0
+                else:
+                    doc_offset = len(truncated_query) + sequence_added_tokens
+
+                start_position = tok_start_position - doc_start + doc_offset
+                end_position = tok_end_position - doc_start + doc_offset
+        features.append(
+            SquadFeatures(
+                span["input_ids"],
+                span["attention_mask"],
+                span["token_type_ids"],
+                cls_index,
+                p_mask.tolist(),
+                example_index=0,  # Can not set unique_id and example_index here. They will be set after multiple processing.
+                unique_id=0,
+                paragraph_len=span["paragraph_len"],
+                token_is_max_context=span["token_is_max_context"],
+                tokens=span["tokens"],
+                token_to_orig_map=span["token_to_orig_map"],
+                start_position=start_position,
+                end_position=end_position,
+                is_impossible=span_is_impossible,
+                qas_id=example.qas_id,
+            )
+        )
+    return features
+
+
+def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase):
+    """Publish `tokenizer_for_convert` as this module's `tokenizer` global."""
+    globals()["tokenizer"] = tokenizer_for_convert
+
+
+def squad_convert_examples_to_features(
+    examples,
+    tokenizer,
+    max_seq_length,
+    doc_stride,
+    max_query_length,
+    is_training,
+    padding_strategy="max_length",
+    return_dataset=False,
+    threads=1,
+    tqdm_enabled=True,
+):
+    """
+    Converts a list of examples into a list of features that can be directly given as input to a model. It is
+    model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.
+
+    Args:
+        examples: list of [`~data.processors.squad.SquadExample`]
+        tokenizer: an instance of a child of [`PreTrainedTokenizer`]
+        max_seq_length: The maximum sequence length of the inputs.
+        doc_stride: The stride used when the context is too large and is split across several features.
+        max_query_length: The maximum length of the query.
+        is_training: whether to create features for model training (True) or model evaluation (False).
+        padding_strategy: Default to "max_length". Which padding strategy to use
+        return_dataset: Default False. Either 'pt' or 'tf'.
+            if 'pt': returns `(features, torch TensorDataset)`, if 'tf': returns only a tf.data.Dataset
+        threads: size of the worker pool, capped at `cpu_count()`.
+        tqdm_enabled: Default True. Set to False to disable both progress bars.
+
+    Returns:
+        list of [`~data.processors.squad.SquadFeatures`] when `return_dataset` is neither 'pt' nor 'tf'.
+
+    Example:
+
+    ```python
+    processor = SquadV2Processor()
+    examples = processor.get_dev_examples(data_dir)
+
+    features = squad_convert_examples_to_features(
+        examples=examples,
+        tokenizer=tokenizer,
+        max_seq_length=args.max_seq_length,
+        doc_stride=args.doc_stride,
+        max_query_length=args.max_query_length,
+        is_training=not evaluate,
+    )
+    ```"""
+
+    threads = min(threads, cpu_count())
+    pool_cls = ThreadPool if is_torch_hpu_available() else Pool  # thread pool on HPU, process pool otherwise
+    with pool_cls(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+        annotate_ = partial(
+            squad_convert_example_to_features,
+            max_seq_length=max_seq_length,
+            doc_stride=doc_stride,
+            max_query_length=max_query_length,
+            padding_strategy=padding_strategy,
+            is_training=is_training,
+        )
+        features = list(
+            tqdm(
+                p.imap(annotate_, examples, chunksize=32),
+                total=len(examples),
+                desc="convert squad examples to features",
+                disable=not tqdm_enabled,
+            )
+        )
+
+    new_features = []
+    unique_id = 1000000000
+    example_index = 0
+    for example_features in tqdm(
+        features, total=len(features), desc="add example index and unique id", disable=not tqdm_enabled
+    ):
+        if not example_features:  # an example that produced no features does not consume an example_index
+            continue
+        for example_feature in example_features:
+            example_feature.example_index = example_index
+            example_feature.unique_id = unique_id
+            new_features.append(example_feature)
+            unique_id += 1
+        example_index += 1
+    features = new_features
+    del new_features
+    if return_dataset == "pt":
+        if not is_torch_available():
+            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")
+
+        # Convert to Tensors and build dataset
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)
+
+        if not is_training:
+            all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+            dataset = TensorDataset(
+                all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask
+            )
+        else:
+            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+            dataset = TensorDataset(
+                all_input_ids,
+                all_attention_masks,
+                all_token_type_ids,
+                all_start_positions,
+                all_end_positions,
+                all_cls_index,
+                all_p_mask,
+                all_is_impossible,
+            )
+
+        return features, dataset
+    elif return_dataset == "tf":
+        if not is_tf_available():
+            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")
+
+        def gen():
+            for i, ex in enumerate(features):
+                if ex.token_type_ids is None:  # leave "token_type_ids" out of the inputs dict
+                    yield (
+                        {
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "feature_index": i,
+                            "qas_id": ex.qas_id,
+                        },
+                        {
+                            "start_positions": ex.start_position,
+                            "end_positions": ex.end_position,
+                            "cls_index": ex.cls_index,
+                            "p_mask": ex.p_mask,
+                            "is_impossible": ex.is_impossible,
+                        },
+                    )
+                else:
+                    yield (
+                        {
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "token_type_ids": ex.token_type_ids,
+                            "feature_index": i,
+                            "qas_id": ex.qas_id,
+                        },
+                        {
+                            "start_positions": ex.start_position,
+                            "end_positions": ex.end_position,
+                            "cls_index": ex.cls_index,
+                            "p_mask": ex.p_mask,
+                            "is_impossible": ex.is_impossible,
+                        },
+                    )
+        # NOTE(review): gen() keys off `ex.token_type_ids`, the signature below off `model_input_names` - confirm both agree
+        # Why have we split the batch into a tuple? PyTorch just has a list of tensors.
+        if "token_type_ids" in tokenizer.model_input_names:
+            train_types = (
+                {
+                    "input_ids": tf.int32,
+                    "attention_mask": tf.int32,
+                    "token_type_ids": tf.int32,
+                    "feature_index": tf.int64,
+                    "qas_id": tf.string,
+                },
+                {
+                    "start_positions": tf.int64,
+                    "end_positions": tf.int64,
+                    "cls_index": tf.int64,
+                    "p_mask": tf.int32,
+                    "is_impossible": tf.int32,
+                },
+            )
+
+            train_shapes = (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                    "feature_index": tf.TensorShape([]),
+                    "qas_id": tf.TensorShape([]),
+                },
+                {
+                    "start_positions": tf.TensorShape([]),
+                    "end_positions": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                    "is_impossible": tf.TensorShape([]),
+                },
+            )
+        else:
+            train_types = (
+                {"input_ids": tf.int32, "attention_mask": tf.int32, "feature_index": tf.int64, "qas_id": tf.string},
+                {
+                    "start_positions": tf.int64,
+                    "end_positions": tf.int64,
+                    "cls_index": tf.int64,
+                    "p_mask": tf.int32,
+                    "is_impossible": tf.int32,
+                },
+            )
+
+            train_shapes = (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "feature_index": tf.TensorShape([]),
+                    "qas_id": tf.TensorShape([]),
+                },
+                {
+                    "start_positions": tf.TensorShape([]),
+                    "end_positions": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                    "is_impossible": tf.TensorShape([]),
+                },
+            )
+
+        return tf.data.Dataset.from_generator(gen, train_types, train_shapes)
+    else:
+        return features
+
+
+class SquadProcessor(DataProcessor):
+    """
+    Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
+    version 2.0 of SQuAD, respectively.
+    """
+
+    train_file = None
+    dev_file = None
+
+    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
+        if not evaluate:
+            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
+            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
+            answers = []
+        else:
+            answers = [
+                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
+                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
+            ]
+
+            answer = None
+            answer_start = None
+
+        return SquadExample(
+            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
+            question_text=tensor_dict["question"].numpy().decode("utf-8"),
+            context_text=tensor_dict["context"].numpy().decode("utf-8"),
+            answer_text=answer,
+            start_position_character=answer_start,
+            title=tensor_dict["title"].numpy().decode("utf-8"),
+            answers=answers,
+        )
+
+    def get_examples_from_dataset(self, dataset, evaluate=False):
+        """
+        Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
+
+        Args:
+            dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")*
+            evaluate: Boolean specifying if in evaluation mode or in training mode
+
+        Returns:
+            List of SquadExample
+
+        Examples:
+
+        ```python
+        >>> import tensorflow_datasets as tfds
+
+        >>> dataset = tfds.load("squad")
+
+        >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
+        >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
+        ```"""
+
+        if evaluate:
+            dataset = dataset["validation"]
+        else:
+            dataset = dataset["train"]
+
+        examples = []
+        for tensor_dict in tqdm(dataset):
+            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
+
+        return examples
+
+    def get_train_examples(self, data_dir, filename=None):
+        """
+        Load the training split from a SQuAD-format JSON file and convert it to examples.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating. `None` is treated as
+                the empty string, so the file name is then used as the path on its own.
+            filename: None by default, specify this if the training file has a different name than the original one
+                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+
+        Raises:
+            ValueError: if `self.train_file` is `None`.
+        """
+        if self.train_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+
+        train_name = self.train_file if filename is None else filename  # an explicit name wins
+        train_path = os.path.join("" if data_dir is None else data_dir, train_name)
+        with open(train_path, "r", encoding="utf-8") as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "train")
+
+    def get_dev_examples(self, data_dir, filename=None):
+        """
+        Load the evaluation split from a SQuAD-format JSON file and convert it to examples.
+
+        Args:
+            data_dir: Directory containing the data files used for training and evaluating. `None` is treated as
+                the empty string, so the file name is then used as the path on its own.
+            filename: None by default, specify this if the evaluation file has a different name than the original one
+                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
+
+        Raises:
+            ValueError: if `self.dev_file` is `None`.
+        """
+        if self.dev_file is None:
+            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
+        dev_name = self.dev_file if filename is None else filename  # an explicit name wins
+        dev_path = os.path.join("" if data_dir is None else data_dir, dev_name)
+        with open(dev_path, "r", encoding="utf-8") as reader:
+            input_data = json.load(reader)["data"]
+        return self._create_examples(input_data, "dev")
+
+    def _create_examples(self, input_data, set_type):
+        """Flatten SQuAD-format `input_data` (articles -> paragraphs -> qas) into a list of `SquadExample`."""
+        training = set_type == "train"
+        collected = []
+        for article in tqdm(input_data):
+            article_title = article["title"]
+            for para in article["paragraphs"]:
+                passage = para["context"]
+                for qa in para["qas"]:
+                    qa_id, qa_question = qa["id"], qa["question"]
+                    unanswerable = qa.get("is_impossible", False)
+
+                    # Values kept when no answer is extracted below.
+                    gold_text, gold_start, gold_answers = None, None, []
+                    if not unanswerable and training:
+                        # Training only keeps the first annotated answer.
+                        first = qa["answers"][0]
+                        gold_text, gold_start = first["text"], first["answer_start"]
+                    elif not unanswerable:
+                        # Outside training the whole list of answers is passed through.
+                        gold_answers = qa["answers"]
+
+                    collected.append(
+                        SquadExample(
+                            qas_id=qa_id,
+                            question_text=qa_question,
+                            context_text=passage,
+                            answer_text=gold_text,
+                            start_position_character=gold_start,
+                            title=article_title,
+                            is_impossible=unanswerable,
+                            answers=gold_answers,
+                        )
+                    )
+        return collected
+
+
+class SquadV1Processor(SquadProcessor):  # SquadProcessor preset with the SQuAD v1.1 file names
+    train_file = "train-v1.1.json"  # opened by get_train_examples when no `filename` is passed
+    dev_file = "dev-v1.1.json"  # opened by get_dev_examples when no `filename` is passed
+
+
+class SquadV2Processor(SquadProcessor):  # SquadProcessor preset with the SQuAD v2.0 file names
+    train_file = "train-v2.0.json"  # opened by get_train_examples when no `filename` is passed
+    dev_file = "dev-v2.0.json"  # opened by get_dev_examples when no `filename` is passed
+
+
+class SquadExample:
+    """
+    A single training/test example for the Squad dataset, as loaded from disk.
+
+    Args:
+        qas_id: The example's unique identifier
+        question_text: The question string
+        context_text: The context string
+        answer_text: The answer string
+        start_position_character: The character position of the start of the answer
+        title: The title of the example
+        answers: None by default (becomes `[]`). Used during evaluation; holds answers and their start positions.
+        is_impossible: False by default, set to True if the example has no possible answer.
+    """
+
+    def __init__(
+        self,
+        qas_id,
+        question_text,
+        context_text,
+        answer_text,
+        start_position_character,
+        title,
+        answers=None,
+        is_impossible=False,
+    ):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.context_text = context_text
+        self.answer_text = answer_text
+        self.title = title
+        self.is_impossible = is_impossible
+        self.answers = [] if answers is None else answers  # new list per instance, never a shared default
+
+        self.start_position, self.end_position = 0, 0
+
+        doc_tokens = []
+        char_to_word_offset = []
+        prev_is_whitespace = True
+        # char_to_word_offset[i] is the index of the word holding character i; whitespace maps to the word before it.
+        # Split on whitespace so that different tokens may be attributed to their original position.
+        for c in self.context_text:
+            if _is_whitespace(c):
+                prev_is_whitespace = True
+            else:
+                if prev_is_whitespace:
+                    doc_tokens.append(c)
+                else:
+                    doc_tokens[-1] += c
+                prev_is_whitespace = False
+            char_to_word_offset.append(len(doc_tokens) - 1)
+
+        self.doc_tokens = doc_tokens
+        self.char_to_word_offset = char_to_word_offset
+
+        # Word-level positions stay (0, 0) unless a character start is given for an answerable example.
+        if start_position_character is not None and not is_impossible:
+            self.start_position = char_to_word_offset[start_position_character]
+            self.end_position = char_to_word_offset[
+                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
+            ]
+
+
+class SquadFeatures:
+    """
+    Features of a single squad example, ready to be fed to a model. Those features are model-specific and can be
+    crafted from [`~data.processors.squad.SquadExample`] using
+    [`~transformers.data.processors.squad.squad_convert_examples_to_features`].
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        cls_index: the index of the CLS token.
+        p_mask: Mask identifying tokens that can be answers vs. tokens that cannot.
+            Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer
+        example_index: the index of the example
+        unique_id: The unique Feature identifier
+        paragraph_len: The length of the context
+        token_is_max_context:
+            List of booleans identifying which tokens have their maximum context in this feature object. If a token
+            does not have their maximum context in this feature object, it means that another feature object has more
+            information related to that token and should be prioritized over this feature for that token.
+        tokens: list of tokens corresponding to the input ids
+        token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer.
+        start_position: start of the answer token index
+        end_position: end of the answer token index
+        is_impossible: stored unchanged; presumably the no-answer flag of the source example.
+        qas_id: optional (None by default), stored unchanged; presumably the id of the source example.
+        encoding: optionally store the BatchEncoding with the fast-tokenizer alignment methods.
+    """
+
+    def __init__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        cls_index,
+        p_mask,
+        example_index,
+        unique_id,
+        paragraph_len,
+        token_is_max_context,
+        tokens,
+        token_to_orig_map,
+        start_position,
+        end_position,
+        is_impossible,
+        qas_id: Optional[str] = None,
+        encoding: Optional[BatchEncoding] = None,
+    ):
+        # Token-level sequences and the CLS position.
+        self.input_ids = input_ids
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.cls_index = cls_index
+        self.p_mask = p_mask
+        # Bookkeeping linking the feature to its source example and original text.
+        self.qas_id = qas_id
+        self.example_index = example_index
+        self.unique_id = unique_id
+        self.paragraph_len = paragraph_len
+        self.token_is_max_context = token_is_max_context
+        self.tokens = tokens
+        self.token_to_orig_map = token_to_orig_map
+        self.encoding = encoding
+        self.start_position, self.end_position = start_position, end_position
+        self.is_impossible = is_impossible
+
+
+class SquadResult:
+    """
+    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.
+
+    Args:
+        unique_id: The unique identifier corresponding to that example.
+        start_logits: The logits corresponding to the start of the answer
+        end_logits: The logits corresponding to the end of the answer
+        start_top_index, end_top_index, cls_logits: Optional; stored only when `start_top_index` is not None.
+    """
+
+    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
+        self.start_logits = start_logits
+        self.end_logits = end_logits
+        self.unique_id = unique_id
+        if start_top_index is not None:  # identity test, because arrays/tensors have no single truth value
+            self.start_top_index = start_top_index
+            self.end_top_index = end_top_index
+            self.cls_logits = cls_logits
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..462156ebac384e08d78a7b42ea06f35a457e5feb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/utils.py
@@ -0,0 +1,349 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import dataclasses
+import json
+from dataclasses import dataclass
+from typing import Optional, Union
+
+from ...utils import is_tf_available, is_torch_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+class InputExample:
+    """
+    One raw (untokenized) example for simple sequence classification.
+
+    Args:
+        guid: Unique id for the example.
+        text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+        text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+        label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+    """
+
+    guid: str
+    text_a: str
+    text_b: Optional[str] = None
+    label: Optional[str] = None
+
+    def to_json_string(self):
+        """Return the fields as a 2-space-indented JSON object terminated by a newline."""
+        return "".join((json.dumps(dataclasses.asdict(self), indent=2), "\n"))
+
+
+@dataclass(frozen=True)
+class InputFeatures:
+    """
+    Immutable set of features for one example. Field names match the names of the corresponding model inputs.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
+            tokens.
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
+    """
+
+    input_ids: list[int]
+    attention_mask: Optional[list[int]] = None
+    token_type_ids: Optional[list[int]] = None
+    label: Optional[Union[int, float]] = None
+
+    def to_json_string(self):
+        """Return the fields as a compact single-line JSON object terminated by a newline."""
+        return "".join((json.dumps(dataclasses.asdict(self)), "\n"))
+
+
+class DataProcessor:
+    """Abstract base for data converters that handle sequence classification data sets."""
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """
+        Gets an example from a dict with tensorflow tensors. Not implemented in this base class.
+
+        Args:
+            tensor_dict: Keys and values should match the corresponding Glue tensorflow_dataset examples.
+        """
+        raise NotImplementedError
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of [`InputExample`] for the train set. Not implemented in this base class."""
+        raise NotImplementedError
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of [`InputExample`] for the dev set. Not implemented in this base class."""
+        raise NotImplementedError
+
+    def get_test_examples(self, data_dir):
+        """Gets a collection of [`InputExample`] for the test set. Not implemented in this base class."""
+        raise NotImplementedError
+
+    def get_labels(self):
+        """Gets the list of labels for this data set. Not implemented in this base class."""
+        raise NotImplementedError
+
+    def tfds_map(self, example):
+        """
+        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method
+        replaces an index-like `example.label` with the matching entry of `get_labels()` and returns the example.
+        """
+        if len(self.get_labels()) <= 1:  # zero or one label: the example is returned untouched
+            return example
+        example.label = self.get_labels()[int(example.label)]
+        return example
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file and returns its rows as a list of lists of strings."""
+        with open(input_file, "r", encoding="utf-8-sig") as tsv_file:
+            return [row for row in csv.reader(tsv_file, delimiter="\t", quotechar=quotechar)]
+
+
+class SingleSentenceClassificationProcessor(DataProcessor):
+    """Generic processor for a single sentence classification data set."""
+
+    def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
+        self.labels = [] if labels is None else labels
+        self.examples = [] if examples is None else examples
+        self.mode = mode
+        self.verbose = verbose
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, idx):
+        if isinstance(idx, slice):
+            # A slice yields a new processor; carry `mode` and `verbose` over so it behaves like its parent.
+            subset = self.examples[idx]
+            return SingleSentenceClassificationProcessor(self.labels, subset, self.mode, self.verbose)
+        return self.examples[idx]
+
+    @classmethod
+    def create_from_csv(
+        cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs
+    ):
+        processor = cls(**kwargs)
+        processor.add_examples_from_csv(
+            file_name,
+            split_name=split_name,
+            column_label=column_label,
+            column_text=column_text,
+            column_id=column_id,
+            skip_first_row=skip_first_row,
+            overwrite_labels=True,
+            overwrite_examples=True,
+        )
+        return processor
+
+    @classmethod
+    def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
+        processor = cls(**kwargs)
+        processor.add_examples(texts_or_text_and_labels, labels=labels)
+        return processor
+
+    def add_examples_from_csv(
+        self,
+        file_name,
+        split_name="",
+        column_label=0,
+        column_text=1,
+        column_id=None,
+        skip_first_row=False,
+        overwrite_labels=False,
+        overwrite_examples=False,
+    ):
+        """Read a tab-separated file (despite the name) and add one example per row; returns `self.examples`."""
+        lines = self._read_tsv(file_name)
+        if skip_first_row:
+            lines = lines[1:]
+        texts = []
+        labels = []
+        ids = []
+        for i, line in enumerate(lines):
+            texts.append(line[column_text])
+            labels.append(line[column_label])
+            if column_id is not None:
+                ids.append(line[column_id])
+            else:
+                guid = f"{split_name}-{i}" if split_name else str(i)
+                ids.append(guid)
+
+        return self.add_examples(
+            texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples
+        )
+
+    def add_examples(
+        self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
+    ):
+        """Add texts (or `(text, label)` pairs when `labels` is omitted) as examples; returns `self.examples`."""
+        if labels is not None and len(texts_or_text_and_labels) != len(labels):
+            raise ValueError(
+                f"Text and labels have mismatched lengths {len(texts_or_text_and_labels)} and {len(labels)}"
+            )
+        if ids is not None and len(texts_or_text_and_labels) != len(ids):
+            raise ValueError(f"Text and ids have mismatched lengths {len(texts_or_text_and_labels)} and {len(ids)}")
+        if ids is None:
+            ids = [None] * len(texts_or_text_and_labels)
+        if labels is None:
+            labels = [None] * len(texts_or_text_and_labels)
+        examples = []
+        added_labels = {}  # dict used as an insertion-ordered set
+        for text_or_text_and_label, label, guid in zip(texts_or_text_and_labels, labels, ids):
+            if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
+                text, label = text_or_text_and_label
+            else:
+                text = text_or_text_and_label
+            added_labels[label] = None
+            examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))
+
+        # Update examples
+        if overwrite_examples:
+            self.examples = examples
+        else:
+            self.examples.extend(examples)
+
+        # Update labels, keeping first-seen order so the label -> id mapping is the same on every run
+        if overwrite_labels:
+            self.labels = list(added_labels)
+        else:
+            self.labels = list(dict.fromkeys([*self.labels, *added_labels]))
+
+        return self.examples
+
+    def get_features(
+        self,
+        tokenizer,
+        max_length=None,
+        pad_on_left=False,
+        pad_token=0,
+        mask_padding_with_zero=True,
+        return_tensors=None,
+    ):
+        """
+        Convert examples in a list of `InputFeatures`
+
+        Args:
+            tokenizer: Instance of a tokenizer that will tokenize the examples
+            max_length: Maximum example length
+            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
+            pad_token: Padding token
+            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
+                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for actual
+                values)
+            return_tensors: `None` (default), `"tf"` or `"pt"`; selects the return type described below.
+
+        Returns:
+            A list of `InputFeatures` when `return_tensors` is `None`, a `tf.data.Dataset` when it is `"tf"`, or a
+            `torch.utils.data.TensorDataset` of `(input_ids, attention_mask, labels)` when it is `"pt"`.
+        """
+        if self.mode not in ("classification", "regression"):
+            raise ValueError(self.mode)
+        # `max_len` was renamed `model_max_length`; accept tokenizers exposing either attribute.
+        tokenizer_limit = getattr(tokenizer, "model_max_length", None) or tokenizer.max_len
+        if max_length is None:
+            max_length = tokenizer_limit
+
+        label_map = {label: i for i, label in enumerate(self.labels)}
+
+        all_input_ids = []
+        for ex_index, example in enumerate(self.examples):
+            if ex_index % 10000 == 0:
+                logger.info(f"Tokenizing example {ex_index}")
+            input_ids = tokenizer.encode(
+                example.text_a,
+                add_special_tokens=True,
+                max_length=min(max_length, tokenizer_limit),
+            )
+            all_input_ids.append(input_ids)
+
+        # `default=0` keeps a processor without examples from failing on an empty `max()`.
+        batch_length = max((len(input_ids) for input_ids in all_input_ids), default=0)
+
+        features = []
+        for ex_index, (input_ids, example) in enumerate(zip(all_input_ids, self.examples)):
+            if ex_index % 10000 == 0:
+                logger.info(f"Writing example {ex_index}/{len(self.examples)}")
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            padding_length = batch_length - len(input_ids)
+            if pad_on_left:
+                input_ids = ([pad_token] * padding_length) + input_ids
+                attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
+            else:
+                input_ids = input_ids + ([pad_token] * padding_length)
+                attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+
+            if len(input_ids) != batch_length:
+                raise ValueError(f"Error with input length {len(input_ids)} vs {batch_length}")
+            if len(attention_mask) != batch_length:
+                raise ValueError(f"Error with input length {len(attention_mask)} vs {batch_length}")
+
+            label = label_map[example.label] if self.mode == "classification" else float(example.label)
+            if ex_index < 5 and self.verbose:
+                logger.info("*** Example ***")
+                logger.info(f"guid: {example.guid}")
+                logger.info(f"input_ids: {' '.join([str(x) for x in input_ids])}")
+                logger.info(f"attention_mask: {' '.join([str(x) for x in attention_mask])}")
+                logger.info(f"label: {example.label} (id = {label})")
+
+            features.append(InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=label))
+
+        if return_tensors is None:
+            return features
+        elif return_tensors == "tf":
+            if not is_tf_available():
+                raise RuntimeError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported")
+            import tensorflow as tf
+
+            def gen():
+                for ex in features:
+                    yield ({"input_ids": ex.input_ids, "attention_mask": ex.attention_mask}, ex.label)
+
+            # Regression labels are floats, so they cannot be declared as int64.
+            label_dtype = tf.int64 if self.mode == "classification" else tf.float32
+            dataset = tf.data.Dataset.from_generator(
+                gen,
+                ({"input_ids": tf.int32, "attention_mask": tf.int32}, label_dtype),
+                ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])}, tf.TensorShape([])),
+            )
+            return dataset
+        elif return_tensors == "pt":
+            if not is_torch_available():
+                raise RuntimeError("return_tensors set to 'pt' but PyTorch can't be imported")
+            import torch
+            from torch.utils.data import TensorDataset
+
+            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+            label_dtype = torch.long if self.mode == "classification" else torch.float
+            all_labels = torch.tensor([f.label for f in features], dtype=label_dtype)
+            dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
+            return dataset
+        else:
+            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/xnli.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/xnli.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d8ec17a8345db5bf08325a334a4c6eb8af29157
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/data/processors/xnli.py
@@ -0,0 +1,96 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""XNLI utils (dataset loading and evaluation)"""
+
+import os
+
+from ...utils import logging
+from .utils import DataProcessor, InputExample
+
+
+logger = logging.get_logger(__name__)
+
+
+class XnliProcessor(DataProcessor):
+    """
+    Processor for the XNLI dataset. Adapted from
+    https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
+    """
+
+    def __init__(self, language, train_language=None):
+        self.language = language
+        self.train_language = train_language
+
+    def get_train_examples(self, data_dir):
+        """See base class. Reads the MultiNLI training file of `train_language` (or `language` if unset)."""
+        lg = self.language if self.train_language is None else self.train_language
+        lines = self._read_tsv(os.path.join(data_dir, f"XNLI-MT-1.0/multinli/multinli.train.{lg}.tsv"))
+        examples = []
+        # Row 0 is skipped (presumably a header row); `i` is still the row's index in the file.
+        for i, line in enumerate(lines[1:], start=1):
+            guid = f"train-{i}"
+            text_a = line[0]
+            text_b = line[1]
+            # Rows labelled "contradictory" are renamed to "contradiction", the name listed by `get_labels`.
+            label = "contradiction" if line[2] == "contradictory" else line[2]
+            if not isinstance(text_a, str):
+                raise TypeError(f"Training input {text_a} is not a string")
+            if not isinstance(text_b, str):
+                raise TypeError(f"Training input {text_b} is not a string")
+            if not isinstance(label, str):
+                raise TypeError(f"Training label {label} is not a string")
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_test_examples(self, data_dir):
+        """See base class. Keeps only the rows of `XNLI-1.0/xnli.test.tsv` whose first column is `language`."""
+        lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))
+        examples = []
+        # Row 0 is skipped (presumably a header row); `i` is still the row's index in the file.
+        # Columns read below: 0 = language, 1 = label, 6 and 7 = the two sentences.
+        for i, line in enumerate(lines[1:], start=1):
+            language = line[0]
+            if language != self.language:
+                continue
+            guid = f"test-{i}"
+            text_a = line[6]
+            text_b = line[7]
+            label = line[1]
+            if not isinstance(text_a, str):
+                raise TypeError(f"Test input {text_a} is not a string")
+            if not isinstance(text_b, str):
+                raise TypeError(f"Test input {text_b} is not a string")
+            if not isinstance(label, str):
+                raise TypeError(f"Test label {label} is not a string")
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+
+xnli_processors = {  # task name -> processor class
+    "xnli": XnliProcessor,
+}
+
+xnli_output_modes = {  # task name -> output mode string
+    "xnli": "classification",
+}
+
+xnli_tasks_num_labels = {  # task name -> label count (XnliProcessor.get_labels lists 3 names)
+    "xnli": 3,
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4fa8a11d5fc5819fd8e9d2949940aa754a2a281
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/configuration_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/configuration_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0f3795a70e24e399f06f63fd0733cad6b30e823
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/distributed/__pycache__/configuration_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19cfc2c83632a743c04a624ddd2e00f50c629a73
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..388a73d22d4c9b561e2a887b50a1897b8cf2def9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.cpp
@@ -0,0 +1,40 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+at::Tensor
+ms_deform_attn_cpu_forward(  // CPU stub: the body only raises through AT_ERROR
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");  // no argument is read and no tensor is ever returned
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(  // CPU stub: the body only raises through AT_ERROR
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implement on cpu");  // no argument is read and no gradient is ever returned
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..7eac8c8bcd1bf529bb9c13d54d2d4215c9e4c89f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cpu/ms_deform_attn_cpu.h
@@ -0,0 +1,32 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor  // CPU forward entry point; parameter list matches ms_deform_attn_cuda_forward.
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);  // NOTE(review): the accompanying .cpp definition unconditionally raises an error.
+
+std::vector<at::Tensor>  // CPU backward entry point; parameter list matches ms_deform_attn_cuda_backward.
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value, 
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);  // NOTE(review): the accompanying .cpp definition unconditionally raises an error.
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ea1d7fabe2684dbb85f00fae2c47b469687cb2c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cu
@@ -0,0 +1,156 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+#include "cuda/ms_deform_im2col_cuda.cuh"
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#pragma once
+#include <torch/extension.h>
+
+
+// Forward pass of multi-scale deformable attention for CUDA tensors.
+// Checks that every tensor is contiguous and on a CUDA device, then launches
+// ms_deformable_im2col_cuda on the current stream once per chunk of min(batch, im2col_step)
+// samples. Sizes are read as value = (batch, spatial_size, num_heads, channels),
+// spatial_shapes.size(0) = num_levels, sampling_loc.size(1) / size(4) = num_query / num_point.
+// Returns the result viewed as (batch, num_query, num_heads * channels); a failed check raises c10::Error.
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const int im2col_step)
+{
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+    // Empty batch: nothing to launch, and min(batch, im2col_step) would be 0 (division by zero below).
+    if (batch == 0) return output.view({batch, num_query, num_heads * channels});
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+    TORCH_CHECK(im2col_step_ > 0, "im2col_step(", im2col_step, ") must be positive");
+    // TORCH_CHECK concatenates its arguments; printf-style "%d" is never substituted.
+    TORCH_CHECK(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});
+    // Element counts of one chunk, kept in 64-bit so n * chunk cannot overflow int on large inputs.
+    const int64_t value_chunk = static_cast<int64_t>(im2col_step_) * spatial_size * num_heads * channels;
+    const int64_t attn_chunk = static_cast<int64_t>(im2col_step_) * num_query * num_heads * num_levels * num_point;
+    const int64_t loc_chunk = attn_chunk * 2;  // two coordinates per sampling point
+    for (int n = 0; n < batch / im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data_ptr<scalar_t>() + n * value_chunk,
+                spatial_shapes.data_ptr<int64_t>(), level_start_index.data_ptr<int64_t>(),
+                sampling_loc.data_ptr<scalar_t>() + n * loc_chunk,
+                attn_weight.data_ptr<scalar_t>() + n * attn_chunk,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data_ptr<scalar_t>());
+        }));
+    }
+    return output.view({batch, num_query, num_heads * channels});
+}
+
+
+// Backward pass of multi-scale deformable attention for CUDA tensors.
+// Checks that every tensor is contiguous and on a CUDA device, then launches
+// ms_deformable_col2im_cuda on the current stream once per chunk of min(batch, im2col_step)
+// samples, writing into zero-initialised gradient buffers.
+// Returns {grad_value, grad_sampling_loc, grad_attn_weight}, created with zeros_like from
+// value, sampling_loc and attn_weight; a failed check raises c10::Error.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    TORCH_CHECK(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+    TORCH_CHECK(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+    // Empty batch: nothing to launch, and min(batch, im2col_step) would be 0 (division by zero below).
+    if (batch == 0) return {grad_value, grad_sampling_loc, grad_attn_weight};
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+    TORCH_CHECK(im2col_step_ > 0, "im2col_step(", im2col_step, ") must be positive");
+    // TORCH_CHECK concatenates its arguments; printf-style "%d" is never substituted.
+    TORCH_CHECK(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    const int batch_n = im2col_step_;
+    // Element counts of one chunk, kept in 64-bit so n * chunk cannot overflow int on large inputs.
+    const int64_t value_chunk = static_cast<int64_t>(im2col_step_) * spatial_size * num_heads * channels;
+    const int64_t attn_chunk = static_cast<int64_t>(im2col_step_) * num_query * num_heads * num_levels * num_point;
+    const int64_t loc_chunk = attn_chunk * 2;  // two coordinates per sampling point
+    auto grad_output_n = grad_output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});
+
+    for (int n = 0; n < batch / im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data_ptr<scalar_t>(),
+                                    value.data_ptr<scalar_t>() + n * value_chunk,
+                                    spatial_shapes.data_ptr<int64_t>(), level_start_index.data_ptr<int64_t>(),
+                                    sampling_loc.data_ptr<scalar_t>() + n * loc_chunk,
+                                    attn_weight.data_ptr<scalar_t>() + n * attn_chunk,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data_ptr<scalar_t>() + n * value_chunk,
+                                    grad_sampling_loc.data_ptr<scalar_t>() + n * loc_chunk,
+                                    grad_attn_weight.data_ptr<scalar_t>() + n * attn_chunk);
+        }));
+    }
+
+    return {grad_value, grad_sampling_loc, grad_attn_weight};
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..34f8ae9cb77bbaa8cb4dd25e0cb86632db9ad05d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.cuh
@@ -0,0 +1,1467 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include <vector>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THCAtomics.cuh>
+// Grid-stride loop over [0, n): i starts at the global thread id and advances by the total thread count (int index).
+#define CUDA_KERNEL_LOOP(i, n)                          \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x;   \
+      i < (n);                                          \
+      i += blockDim.x * gridDim.x)
+
+
+// Forward pass of multi-scale deformable attention for CUDA tensors.
+// Checks that every tensor is contiguous and on a CUDA device, then launches
+// ms_deformable_im2col_cuda on the current stream once per chunk of min(batch, im2col_step)
+// samples. Sizes are read as value = (batch, spatial_size, num_heads, channels),
+// spatial_shapes.size(0) = num_levels, sampling_loc.size(1) / size(4) = num_query / num_point.
+// Returns the result viewed as (batch, num_query, num_heads * channels); a failed check raises c10::Error.
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const int im2col_step)
+{
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+    // Empty batch: nothing to launch, and min(batch, im2col_step) would be 0 (division by zero below).
+    if (batch == 0) return output.view({batch, num_query, num_heads * channels});
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+    TORCH_CHECK(im2col_step_ > 0, "im2col_step(", im2col_step, ") must be positive");
+    // TORCH_CHECK concatenates its arguments; printf-style "%d" is never substituted.
+    TORCH_CHECK(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    const int batch_n = im2col_step_;
+    auto output_n = output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});
+    // Element counts of one chunk, kept in 64-bit so n * chunk cannot overflow int on large inputs.
+    const int64_t value_chunk = static_cast<int64_t>(im2col_step_) * spatial_size * num_heads * channels;
+    const int64_t attn_chunk = static_cast<int64_t>(im2col_step_) * num_query * num_heads * num_levels * num_point;
+    const int64_t loc_chunk = attn_chunk * 2;  // two coordinates per sampling point
+    for (int n = 0; n < batch / im2col_step_; ++n)
+    {
+        auto columns = output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
+            ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
+                value.data_ptr<scalar_t>() + n * value_chunk,
+                spatial_shapes.data_ptr<int64_t>(), level_start_index.data_ptr<int64_t>(),
+                sampling_loc.data_ptr<scalar_t>() + n * loc_chunk,
+                attn_weight.data_ptr<scalar_t>() + n * attn_chunk,
+                batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                columns.data_ptr<scalar_t>());
+        }));
+    }
+    return output.view({batch, num_query, num_heads * channels});
+}
+
+
+// Backward pass of multi-scale deformable attention for CUDA tensors.
+// Checks that every tensor is contiguous and on a CUDA device, then launches
+// ms_deformable_col2im_cuda on the current stream once per chunk of min(batch, im2col_step)
+// samples, writing into zero-initialised gradient buffers.
+// Returns {grad_value, grad_sampling_loc, grad_attn_weight}, created with zeros_like from
+// value, sampling_loc and attn_weight; a failed check raises c10::Error.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+    TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+    TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+    TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+    TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+    TORCH_CHECK(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+
+    TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+    TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+    TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+    TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+    TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+    TORCH_CHECK(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
+
+    const int batch = value.size(0);
+    const int spatial_size = value.size(1);
+    const int num_heads = value.size(2);
+    const int channels = value.size(3);
+    const int num_levels = spatial_shapes.size(0);
+    const int num_query = sampling_loc.size(1);
+    const int num_point = sampling_loc.size(4);
+
+    auto grad_value = at::zeros_like(value);
+    auto grad_sampling_loc = at::zeros_like(sampling_loc);
+    auto grad_attn_weight = at::zeros_like(attn_weight);
+    // Empty batch: nothing to launch, and min(batch, im2col_step) would be 0 (division by zero below).
+    if (batch == 0) return {grad_value, grad_sampling_loc, grad_attn_weight};
+
+    const int im2col_step_ = std::min(batch, im2col_step);
+    TORCH_CHECK(im2col_step_ > 0, "im2col_step(", im2col_step, ") must be positive");
+    // TORCH_CHECK concatenates its arguments; printf-style "%d" is never substituted.
+    TORCH_CHECK(batch % im2col_step_ == 0, "batch(", batch, ") must be divisible by im2col_step(", im2col_step_, ")");
+
+    const int batch_n = im2col_step_;
+    // Element counts of one chunk, kept in 64-bit so n * chunk cannot overflow int on large inputs.
+    const int64_t value_chunk = static_cast<int64_t>(im2col_step_) * spatial_size * num_heads * channels;
+    const int64_t attn_chunk = static_cast<int64_t>(im2col_step_) * num_query * num_heads * num_levels * num_point;
+    const int64_t loc_chunk = attn_chunk * 2;  // two coordinates per sampling point
+    auto grad_output_n = grad_output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});
+
+    for (int n = 0; n < batch / im2col_step_; ++n)
+    {
+        auto grad_output_g = grad_output_n.select(0, n);
+        AT_DISPATCH_FLOATING_TYPES(value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
+            ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
+                                    grad_output_g.data_ptr<scalar_t>(),
+                                    value.data_ptr<scalar_t>() + n * value_chunk,
+                                    spatial_shapes.data_ptr<int64_t>(), level_start_index.data_ptr<int64_t>(),
+                                    sampling_loc.data_ptr<scalar_t>() + n * loc_chunk,
+                                    attn_weight.data_ptr<scalar_t>() + n * attn_chunk,
+                                    batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+                                    grad_value.data_ptr<scalar_t>() + n * value_chunk,
+                                    grad_sampling_loc.data_ptr<scalar_t>() + n * loc_chunk,
+                                    grad_attn_weight.data_ptr<scalar_t>() + n * attn_chunk);
+        }));
+    }
+
+    return {grad_value, grad_sampling_loc, grad_attn_weight};
+}
+
+// Thread-count constant; presumably a default block size (its users are outside this excerpt).
+const int CUDA_NUM_THREADS = 1024;
+// Ceiling division: how many groups of num_threads are needed to cover N items.
+inline int GET_BLOCKS(const int N, const int num_threads)
+{ const int padded = N + num_threads - 1; return padded / num_threads; }
+
+
+// Bilinearly samples one channel of one head from a single feature level.
+// bottom_data is indexed as ((row * width + col) * nheads + m) * channels + c; (h, w) is the
+// fractional sample position. A corner is read only if it passes its bound test
+// (low >= 0, high <= size - 1); otherwise it contributes 0 to the interpolated value.
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c)
+{
+  // Integer corners surrounding the sample point.
+  const int row_lo = floor(h);
+  const int col_lo = floor(w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  // Fractional distances to the low corner and their complements.
+  const scalar_t frac_h = h - row_lo;
+  const scalar_t frac_w = w - col_lo;
+  const scalar_t inv_h = 1 - frac_h, inv_w = 1 - frac_w;
+
+  // Element offsets for the (row, col, head, channel) layout.
+  const int col_stride = nheads * channels;
+  const int row_stride = width * col_stride;
+  const int row_lo_off = row_lo * row_stride;
+  const int row_hi_off = row_lo_off + row_stride;
+  const int col_lo_off = col_lo * col_stride;
+  const int col_hi_off = col_lo_off + col_stride;
+  const int lane_off = m * channels + c;
+
+  // Per-axis bound tests shared by the four corners.
+  const bool row_lo_ok = row_lo >= 0;
+  const bool row_hi_ok = row_hi <= height - 1;
+  const bool col_lo_ok = col_lo >= 0;
+  const bool col_hi_ok = col_hi <= width - 1;
+
+  // Corner values; a corner failing its test keeps the value 0.
+  scalar_t v1 = 0, v2 = 0, v3 = 0, v4 = 0;
+  if (row_lo_ok && col_lo_ok)
+    v1 = bottom_data[row_lo_off + col_lo_off + lane_off];
+  if (row_lo_ok && col_hi_ok)
+    v2 = bottom_data[row_lo_off + col_hi_off + lane_off];
+  if (row_hi_ok && col_lo_ok)
+    v3 = bottom_data[row_hi_off + col_lo_off + lane_off];
+  if (row_hi_ok && col_hi_ok)
+    v4 = bottom_data[row_hi_off + col_hi_off + lane_off];
+
+  // Bilinear weights, in the same corner order as v1..v4.
+  const scalar_t w1 = inv_h * inv_w, w2 = inv_h * frac_w, w3 = frac_h * inv_w, w4 = frac_h * frac_w;
+
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+// Backward of one bilinear sample for a single (head, channel): adds w_i * top_grad * attn_weight
+// into grad_value with atomicAdd, and overwrites grad_sampling_loc[0], [1] and *grad_attn_weight.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,  // upstream gradient of the output element
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value,  // accumulated into with atomicAdd
+                                                   scalar_t* grad_sampling_loc,  // written: [0] from the w terms, [1] from the h terms
+                                                   scalar_t* grad_attn_weight)  // written: top_grad * interpolated value
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+  // For each corner passing its bound test: read the value, accumulate the h/w weight derivatives, scatter into grad_value.
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  // Plain (non-atomic) stores: the visible caller passes per-thread shared-memory slots here.
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  *grad_attn_weight = top_grad * val;
+  *grad_sampling_loc = width * grad_w_weight * top_grad_value;  // scaled by width: the caller forms w as loc_w * width - 0.5
+  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;  // scaled by height: h is loc_h * height - 0.5
+}
+// Same computation as ms_deform_attn_col2im_bilinear, except that grad_sampling_loc[0], [1] and
+// *grad_attn_weight are accumulated with atomicAdd instead of being overwritten.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, 
+                                                   const int &height, const int &width, const int &nheads, const int &channels,
+                                                   const scalar_t &h, const scalar_t &w, const int &m, const int &c,
+                                                   const scalar_t &top_grad,
+                                                   const scalar_t &attn_weight,
+                                                   scalar_t* &grad_value,  // accumulated into with atomicAdd
+                                                   scalar_t* grad_sampling_loc,  // accumulated: [0] from the w terms, [1] from the h terms
+                                                   scalar_t* grad_attn_weight)  // accumulated: top_grad * interpolated value
+{
+  const int h_low = floor(h);
+  const int w_low = floor(w);
+  const int h_high = h_low + 1;
+  const int w_high = w_low + 1;
+
+  const scalar_t lh = h - h_low;
+  const scalar_t lw = w - w_low;
+  const scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  const int w_stride = nheads * channels;
+  const int h_stride = width * w_stride;
+  const int h_low_ptr_offset = h_low * h_stride;
+  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
+  const int w_low_ptr_offset = w_low * w_stride;
+  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
+  const int base_ptr = m * channels + c;
+
+  const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+  const scalar_t top_grad_value = top_grad * attn_weight;
+  scalar_t grad_h_weight = 0, grad_w_weight = 0;
+  // For each corner passing its bound test: read the value, accumulate the h/w weight derivatives, scatter into grad_value.
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+  {
+    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
+    v1 = bottom_data[ptr1];
+    grad_h_weight -= hw * v1;
+    grad_w_weight -= hh * v1;
+    atomicAdd(grad_value+ptr1, w1*top_grad_value);
+  }
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+  {
+    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
+    v2 = bottom_data[ptr2];
+    grad_h_weight -= lw * v2;
+    grad_w_weight += hh * v2;
+    atomicAdd(grad_value+ptr2, w2*top_grad_value);
+  }
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+  {
+    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
+    v3 = bottom_data[ptr3];
+    grad_h_weight += hw * v3;
+    grad_w_weight -= lh * v3;
+    atomicAdd(grad_value+ptr3, w3*top_grad_value); 
+  }
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+  {
+    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
+    v4 = bottom_data[ptr4];
+    grad_h_weight += lw * v4;
+    grad_w_weight += lh * v4;
+    atomicAdd(grad_value+ptr4, w4*top_grad_value);
+  }
+  // Atomic accumulation into the destination pointers (contrast with the plain stores of the non-_gm variant).
+  const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  atomicAdd(grad_attn_weight, top_grad * val); 
+  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+}
+
+
+// Forward kernel: each loop index is one output element (batch, query, head, channel) of data_col.
+// For every level and sampling point it samples data_value bilinearly at loc * size - 0.5,
+// multiplies by the attention weight, and writes the accumulated sum to data_col[index].
+// n is expected to equal batch * num_query * num_heads * channels; batch_size is not read here.
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(const int n,
+                                                const scalar_t *data_value, 
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decompose index into (b_col, query, m_col, c_col); the query id itself is never needed.
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // flat (batch, query, head) index
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int b_col = _temp / num_query;
+
+    scalar_t *data_col_ptr = data_col + index;
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;  // two coordinates (w, h) per weight
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    scalar_t col = 0;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Samples outside the open range (-1, size) on either axis are skipped.
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+          col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+    *data_col_ptr = col;
+  }
+}
+// Backward kernel: per-thread sampling-location / attention-weight gradients go through shared memory and are summed serially by thread 0.
+template <typename scalar_t, unsigned int blockSize>  // shared caches below are sized by blockSize
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];  // two slots (w, h) per thread
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;
+    _temp /= num_query;
+    const int b_col = _temp;
+    // NOTE(review): q_col is computed but never used in this kernel.
+    const scalar_t top_grad = grad_col[index];
+    // NOTE(review): the += below advance the kernel parameters in place, so a second pass of the enclosing loop would start from already-advanced pointers; presumably each thread runs the body once - confirm at the launch site.
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so skipped samples contribute 0
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Barrier inside the loop body: assumes every thread of the block is in this iteration and blockDim.x == blockSize - TODO confirm at the launch site.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockSize; ++tid)  // inner tid shadows the outer one; serial sum over all blockSize slots
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          // Plain (non-atomic) stores: assumes no other block writes this (sampling_index, level, point) slot - TODO confirm.
+          
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();  // keep the caches intact until thread 0 has finished reading them
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t, unsigned int blockSize>  // Gradient kernel: each thread stages its sampling-location / attention-weight gradient in statically sized shared memory, then the block sums the slots with a halving loop.
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,  // n: bound handed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // read as grad_col[index]
+                                                const scalar_t *data_value,  // forwarded (offset per batch/level) to the bilinear helper
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index,  // one start offset per level
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs, one pair per attention weight
+                                                const scalar_t *data_attn_weight,  // one weight per (sampling_index, level, point)
+                                                const int batch_size,  // not referenced in this kernel body
+                                                const int spatial_size,  // multiplier in the per-batch offset into data_value / grad_value
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // not written directly here; passed on through grad_value_ptr
+                                                scalar_t *grad_sampling_loc,  // output: two values per point, written by thread 0
+                                                scalar_t *grad_attn_weight)  // output: one value per point, written by thread 0
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined outside this chunk; presumably a grid-stride loop over index < n -- confirm
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];  // two slots per thread, indexed by threadIdx.x, so blockSize must be >= blockDim.x
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // index decomposes as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index with the channel digit removed; base for the weight/location offsets below
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // q_col is not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first weight for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per weight, hence the doubled offset
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): advances the parameter pointer inside the kernel loop; offsets would accumulate if a thread ran more than one iteration -- confirm the launch guarantees a single iteration
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // int arithmetic; NOTE(review): confirm the product stays within int range
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t value narrowed to int
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset applied to the value gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level height and shift by 0.5 (a double literal); presumably loc_* are normalized coordinates -- confirm
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so a skipped point contributes zero to the sum
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip points outside (-1, spatial_h) x (-1, spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(  // helper defined outside this chunk; its last two arguments are this thread's shared-memory slots
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();  // all slots must be written before other threads read them; NOTE(review): barrier sits inside the kernel loop, so every thread of the block must run the same iterations -- confirm
+
+        for (unsigned int s=blockSize/2; s>0; s>>=1)  // halving loop; reaches every slot only when blockSize is a power of two
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];  // fold slot tid + s into slot tid
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+          }
+          __syncthreads();  // finish this pass before the next one reads the folded slots
+        }
+
+        if (tid == 0)  // slot 0 now holds the block total; stored with plain (non-atomic) writes
+        { 
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();  // keep the slots intact until thread 0 has read them; they are cleared again for the next point
+
+        data_weight_ptr += 1;  // step to the next point's weight, location pair and output slots
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>  // Gradient kernel: each thread stages its sampling-location / attention-weight gradient in dynamic shared memory, then thread 0 sums all blockDim.x slots serially.
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,  // n: bound handed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // read as grad_col[index]
+                                                const scalar_t *data_value,  // forwarded (offset per batch/level) to the bilinear helper
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index,  // one start offset per level
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs, one pair per attention weight
+                                                const scalar_t *data_attn_weight,  // one weight per (sampling_index, level, point)
+                                                const int batch_size,  // not referenced in this kernel body
+                                                const int spatial_size,  // multiplier in the per-batch offset into data_value / grad_value
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // not written directly here; passed on through grad_value_ptr
+                                                scalar_t *grad_sampling_loc,  // output: two values per point, written by thread 0
+                                                scalar_t *grad_attn_weight)  // output: one value per point, written by thread 0
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined outside this chunk; presumably a grid-stride loop over index < n -- confirm
+  {
+    extern __shared__ int _s[];  // dynamic shared memory; this kernel indexes 3 * blockDim.x scalar_t elements of it
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // first 2 * blockDim.x elements: two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // next blockDim.x elements: one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // index decomposes as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index with the channel digit removed; base for the weight/location offsets below
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // q_col is not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first weight for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per weight, hence the doubled offset
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): advances the parameter pointer inside the kernel loop; offsets would accumulate if a thread ran more than one iteration -- confirm the launch guarantees a single iteration
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // int arithmetic; NOTE(review): confirm the product stays within int range
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t value narrowed to int
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset applied to the value gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level height and shift by 0.5 (a double literal); presumably loc_* are normalized coordinates -- confirm
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so a skipped point contributes zero to the sum
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip points outside (-1, spatial_h) x (-1, spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(  // helper defined outside this chunk; its last two arguments are this thread's shared-memory slots
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();  // all slots must be written before thread 0 reads them; NOTE(review): barrier sits inside the kernel loop, so every thread of the block must run the same iterations -- confirm
+        if (tid == 0)  // serial sum over every thread's slots, done by thread 0 only
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];  // start from thread 0's own slots
+          int sid=2;  // index of the next (w, h) slot pair
+          for (unsigned int tid = 1; tid < blockDim.x; ++tid)  // this loop's tid shadows the outer tid declared above
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          
+          
+          *grad_sampling_loc = _grad_w;  // plain (non-atomic) stores of the block total
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();  // keep the slots intact until thread 0 has read them; they are cleared again for the next point
+
+        data_weight_ptr += 1;  // step to the next point's weight, location pair and output slots
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>  // Gradient kernel: each thread stages its sampling-location / attention-weight gradient in dynamic shared memory, then the block sums the slots with a halving loop that also handles odd slot counts.
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,  // n: bound handed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // read as grad_col[index]
+                                                const scalar_t *data_value,  // forwarded (offset per batch/level) to the bilinear helper
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index,  // one start offset per level
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs, one pair per attention weight
+                                                const scalar_t *data_attn_weight,  // one weight per (sampling_index, level, point)
+                                                const int batch_size,  // not referenced in this kernel body
+                                                const int spatial_size,  // multiplier in the per-batch offset into data_value / grad_value
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // not written directly here; passed on through grad_value_ptr
+                                                scalar_t *grad_sampling_loc,  // output: two values per point, written by thread 0
+                                                scalar_t *grad_attn_weight)  // output: one value per point, written by thread 0
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined outside this chunk; presumably a grid-stride loop over index < n -- confirm
+  {
+    extern __shared__ int _s[];  // dynamic shared memory; this kernel indexes 3 * blockDim.x scalar_t elements of it
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // first 2 * blockDim.x elements: two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // next blockDim.x elements: one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // index decomposes as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index with the channel digit removed; base for the weight/location offsets below
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // q_col is not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first weight for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per weight, hence the doubled offset
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): advances the parameter pointer inside the kernel loop; offsets would accumulate if a thread ran more than one iteration -- confirm the launch guarantees a single iteration
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // int arithmetic; NOTE(review): confirm the product stays within int range
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t value narrowed to int
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset applied to the value gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level height and shift by 0.5 (a double literal); presumably loc_* are normalized coordinates -- confirm
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so a skipped point contributes zero to the sum
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip points outside (-1, spatial_h) x (-1, spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(  // helper defined outside this chunk; its last two arguments are this thread's shared-memory slots
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();  // all slots must be written before other threads read them; NOTE(review): barrier sits inside the kernel loop, so every thread of the block must run the same iterations -- confirm
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)  // spre = number of live slots in this pass, s = half of it rounded down
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];  // fold slot tid + s into slot tid
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)  // true only for tid 0 when spre is odd: also fold the leftover slot 2 * s
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            } 
+          }
+          __syncthreads();  // finish this pass before the next one reads the folded slots
+        }
+
+        if (tid == 0)  // slot 0 now holds the block total; stored with plain (non-atomic) writes
+        {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();  // keep the slots intact until thread 0 has read them; they are cleared again for the next point
+
+        data_weight_ptr += 1;  // step to the next point's weight, location pair and output slots
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>  // Gradient kernel: same dynamic-shared-memory halving reduction as the shm_reduce_v2 kernel, but thread 0 adds the block total to the outputs with atomicAdd instead of overwriting them.
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,  // n: bound handed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // read as grad_col[index]
+                                                const scalar_t *data_value,  // forwarded (offset per batch/level) to the bilinear helper
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index,  // one start offset per level
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs, one pair per attention weight
+                                                const scalar_t *data_attn_weight,  // one weight per (sampling_index, level, point)
+                                                const int batch_size,  // not referenced in this kernel body
+                                                const int spatial_size,  // multiplier in the per-batch offset into data_value / grad_value
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // not written directly here; passed on through grad_value_ptr
+                                                scalar_t *grad_sampling_loc,  // output: accumulated into with atomicAdd; presumably zero-initialized by the caller -- confirm
+                                                scalar_t *grad_attn_weight)  // output: accumulated into with atomicAdd; presumably zero-initialized by the caller -- confirm
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined outside this chunk; presumably a grid-stride loop over index < n -- confirm
+  {
+    extern __shared__ int _s[];  // dynamic shared memory; this kernel indexes 3 * blockDim.x scalar_t elements of it
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // first 2 * blockDim.x elements: two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // next blockDim.x elements: one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // index decomposes as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index with the channel digit removed; base for the weight/location offsets below
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // q_col is not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first weight for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per weight, hence the doubled offset
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): advances the parameter pointer inside the kernel loop; offsets would accumulate if a thread ran more than one iteration -- confirm the launch guarantees a single iteration
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // int arithmetic; NOTE(review): confirm the product stays within int range
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t value narrowed to int
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset applied to the value gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level height and shift by 0.5 (a double literal); presumably loc_* are normalized coordinates -- confirm
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots so a skipped point contributes zero to the sum
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip points outside (-1, spatial_h) x (-1, spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(  // helper defined outside this chunk; its last two arguments are this thread's shared-memory slots
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        
+        __syncthreads();  // all slots must be written before other threads read them; NOTE(review): barrier sits inside the kernel loop, so every thread of the block must run the same iterations -- confirm
+
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)  // spre = number of live slots in this pass, s = half of it rounded down
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];  // fold slot tid + s into slot tid
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)  // true only for tid 0 when spre is odd: also fold the leftover slot 2 * s
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();  // finish this pass before the next one reads the folded slots
+        }
+
+        if (tid == 0)  // slot 0 now holds the block total
+        {
+          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);  // accumulate rather than overwrite; presumably several blocks target the same output element -- confirm against the launch site
+          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+        }
+        __syncthreads();  // keep the slots intact until thread 0 has read them; they are cleared again for the next point
+
+        data_weight_ptr += 1;  // step to the next point's weight, location pair and output slots
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>  // Gradient kernel without a shared-memory reduction: each thread hands the output pointers straight to ms_deform_attn_col2im_bilinear_gm.
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,  // n: bound handed to CUDA_KERNEL_LOOP
+                                                const scalar_t *grad_col,  // read as grad_col[index]
+                                                const scalar_t *data_value,  // forwarded (offset per batch/level) to the bilinear helper
+                                                const int64_t *data_spatial_shapes,  // read as (h, w) pairs, one pair per level
+                                                const int64_t *data_level_start_index,  // one start offset per level
+                                                const scalar_t *data_sampling_loc,  // read as (w, h) pairs, one pair per attention weight
+                                                const scalar_t *data_attn_weight,  // one weight per (sampling_index, level, point)
+                                                const int batch_size,  // not referenced in this kernel body
+                                                const int spatial_size,  // multiplier in the per-batch offset into data_value / grad_value
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,  // not written directly here; passed on through grad_value_ptr
+                                                scalar_t *grad_sampling_loc,  // output pointer passed to the helper, offset per point
+                                                scalar_t *grad_attn_weight)  // output pointer passed to the helper, offset per point
+{
+  CUDA_KERNEL_LOOP(index, n)  // macro defined outside this chunk; presumably a grid-stride loop over index < n -- confirm
+  {
+    int _temp = index;  // index decomposes as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp;  // index with the channel digit removed; base for the weight/location offsets below
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // q_col is not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+
+    const scalar_t top_grad = grad_col[index];
+
+    int data_weight_ptr = sampling_index * num_levels * num_point;  // first weight for this sampling_index
+    int data_loc_w_ptr = data_weight_ptr << 1;  // locations hold two values per weight, hence the doubled offset
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;  // NOTE(review): advances the parameter pointer inside the kernel loop; offsets would accumulate if a thread ran more than one iteration -- confirm the launch guarantees a single iteration
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;  // int arithmetic; NOTE(review): confirm the product stays within int range
+
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];  // int64_t value narrowed to int
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;  // same offset applied to the value gradient buffer
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;  // scale by the level height and shift by 0.5 (a double literal); presumably loc_* are normalized coordinates -- confirm
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // skip points outside (-1, spatial_h) x (-1, spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear_gm(  // helper defined outside this chunk; presumably accumulates into the output pointers atomically, since threads with the same sampling_index share them -- confirm
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            grad_sampling_loc, grad_attn_weight);
+        }
+        data_weight_ptr += 1;  // step to the next point's weight, location pair and output slots
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+
+
+template <typename scalar_t>  // Host launcher: enqueues ms_deformable_im2col_gpu_kernel on the given stream and prints any launch error.
+void ms_deformable_im2col_cuda(cudaStream_t stream,  // stream the kernel is launched on
+                              const scalar_t* data_value,  // pointer arguments are forwarded to the kernel unchanged; presumably device pointers -- confirm against caller
+                              const int64_t* data_spatial_shapes, 
+                              const int64_t* data_level_start_index, 
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size, 
+                              const int num_heads, 
+                              const int channels, 
+                              const int num_levels, 
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)  // forwarded as the kernel's last argument; presumably the output buffer -- confirm
+{
+  const int num_kernels = batch_size * num_query * num_heads * channels;  // element count passed to the kernel as its first argument; int product, NOTE(review): confirm it cannot overflow
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;  // same product as num_kernels; used only to size the grid
+  const int num_threads = CUDA_NUM_THREADS;  // block size; constant defined outside this chunk
+  ms_deformable_im2col_gpu_kernel<scalar_t>  // kernel defined outside this chunk
+      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,  // grid size comes from GET_BLOCKS (defined outside this chunk)
+          0, stream>>>(  // no dynamic shared memory
+      num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, 
+      batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
+  
+  cudaError_t err = cudaGetLastError();  // picks up launch errors; the function does not synchronize the stream
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));  // the error is only printed; the function still returns normally
+  }
+
+}
+
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size, 
+                              const int spatial_size, 
+                              const int num_heads,
+                              const int channels, 
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point, 
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
+  const int num_kernels = batch_size * num_query * num_heads * channels;
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;
+  if (channels > 1024)
+  {
+    if ((channels & 1023) == 0)
+    {
+      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+    }
+    else
+    {
+      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+    }
+  }
+  else{
+    switch(channels)
+    {
+      case 1:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 2:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 4:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 8:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 16:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 32:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 64:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 128:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 256:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 512:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 1024:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      default:
+        if (channels < 64)
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+        else
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbcf4543e66bb1162f42ce2ae57e1bac92243cb4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_attn_cuda.h
@@ -0,0 +1,29 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+// Forward entry point of the multi-scale deformable attention CUDA op.
+// Declaration only: every tensor is taken by const reference and a single
+// tensor is returned by value.
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const int im2col_step);
+
+// Backward entry point of the multi-scale deformable attention CUDA op.
+// Declaration only: takes the same tensors as the forward declaration plus
+// grad_output, and returns a vector of tensors.
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const at::Tensor &grad_output,
+    const int im2col_step);
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_im2col_cuda.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_im2col_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c0db0c88c9db2c09d7f601937ea0f6ac480913bf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/cuda/ms_deform_im2col_cuda.cuh
@@ -0,0 +1,1327 @@
+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THCAtomics.cuh>
+
+// Grid-stride loop over [0, n): i starts at the global thread id and advances by the launched thread count.
+#define CUDA_KERNEL_LOOP(i, n)                                        \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);        \
+       i += blockDim.x * gridDim.x)
+
+const int CUDA_NUM_THREADS = 1024;
+// Number of blocks of num_threads needed to cover N items (ceiling division, done in 64 bits so N + num_threads - 1 cannot wrap).
+inline int GET_BLOCKS(const int N, const int num_threads) {
+  return static_cast<int>((static_cast<long long>(N) + num_threads - 1) / num_threads);
+}
+
+
+// Bilinear interpolation of a single (head m, channel c) value at the
+// fractional position (h, w) of one feature level.
+//
+// bottom_data is addressed as ((y * width + x) * nheads + m) * channels + c.
+// Low indices are tested against >= 0 and high indices against <= size - 1;
+// a corner that fails its tests contributes zero. The visible forward kernel
+// only calls this for -1 < h < height and -1 < w < width.
+// Returns the weighted sum of the four corner values.
+template <typename scalar_t>
+__device__ scalar_t ms_deform_attn_im2col_bilinear(
+    const scalar_t* &bottom_data, const int &height, const int &width,
+    const int &nheads, const int &channels, const scalar_t &h,
+    const scalar_t &w, const int &m, const int &c)
+{
+  // Integer grid cell that contains the sample point.
+  const int y_lo = floor(h);
+  const int x_lo = floor(w);
+  const int y_hi = y_lo + 1;
+  const int x_hi = x_lo + 1;
+
+  // Fractional offsets from (y_lo, x_lo) and their complements.
+  const scalar_t frac_y = h - y_lo;
+  const scalar_t frac_x = w - x_lo;
+  const scalar_t rem_y = 1 - frac_y, rem_x = 1 - frac_x;
+
+  // Element offsets of both rows, both columns and the (m, c) slot.
+  const int col_stride = nheads * channels;
+  const int row_stride = width * col_stride;
+  const int y_lo_off = y_lo * row_stride;
+  const int y_hi_off = y_lo_off + row_stride;
+  const int x_lo_off = x_lo * col_stride;
+  const int x_hi_off = x_lo_off + col_stride;
+  const int slot = m * channels + c;
+
+  const bool y_lo_ok = (y_lo >= 0);
+  const bool x_lo_ok = (x_lo >= 0);
+  const bool y_hi_ok = (y_hi <= height - 1);
+  const bool x_hi_ok = (x_hi <= width - 1);
+
+  // v<y><x>: corner values, left at zero when the corner is not readable.
+  scalar_t v00 = 0, v01 = 0, v10 = 0, v11 = 0;
+  if (y_lo_ok && x_lo_ok) v00 = bottom_data[y_lo_off + x_lo_off + slot];
+  if (y_lo_ok && x_hi_ok) v01 = bottom_data[y_lo_off + x_hi_off + slot];
+  if (y_hi_ok && x_lo_ok) v10 = bottom_data[y_hi_off + x_lo_off + slot];
+  if (y_hi_ok && x_hi_ok) v11 = bottom_data[y_hi_off + x_hi_off + slot];
+
+  const scalar_t wt00 = rem_y * rem_x, wt01 = rem_y * frac_x;
+  const scalar_t wt10 = frac_y * rem_x, wt11 = frac_y * frac_x;
+
+  // Terms are summed in the order 00, 01, 10, 11.
+  return (wt00 * v00 + wt01 * v01 + wt10 * v10 + wt11 * v11);
+}
+
+
+// Backward of the bilinear sample for one (head m, channel c) element at (h, w).
+// Each readable corner gets weight * top_grad * attn_weight atomically added
+// into grad_value. *grad_attn_weight, grad_sampling_loc[0] (w) and
+// grad_sampling_loc[1] (h) are written with plain stores, not accumulated, so
+// the caller is expected to pass slots that no other thread writes.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear(
+    const scalar_t* &bottom_data, const int &height, const int &width,
+    const int &nheads, const int &channels, const scalar_t &h,
+    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
+    const scalar_t &attn_weight, scalar_t* &grad_value,
+    scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight)
+{
+  const int y_lo = floor(h);
+  const int x_lo = floor(w);
+  const int y_hi = y_lo + 1;
+  const int x_hi = x_lo + 1;
+  const scalar_t frac_y = h - y_lo;
+  const scalar_t frac_x = w - x_lo;
+  const scalar_t rem_y = 1 - frac_y, rem_x = 1 - frac_x;
+
+  // Element offsets of both rows, both columns and the (m, c) slot.
+  const int col_stride = nheads * channels;
+  const int row_stride = width * col_stride;
+  const int y_lo_off = y_lo * row_stride;
+  const int y_hi_off = y_lo_off + row_stride;
+  const int x_lo_off = x_lo * col_stride;
+  const int x_hi_off = x_lo_off + col_stride;
+  const int slot = m * channels + c;
+
+  const scalar_t wt00 = rem_y * rem_x, wt01 = rem_y * frac_x;
+  const scalar_t wt10 = frac_y * rem_x, wt11 = frac_y * frac_x;
+  const scalar_t scaled_grad = top_grad * attn_weight;
+  scalar_t d_h = 0, d_w = 0;  // derivative of the sample w.r.t. h and w
+  scalar_t v00 = 0, v01 = 0, v10 = 0, v11 = 0;
+
+  if (y_lo >= 0 && x_lo >= 0)
+  {
+    const int off = y_lo_off + x_lo_off + slot;
+    v00 = bottom_data[off];
+    d_h -= rem_x * v00;
+    d_w -= rem_y * v00;
+    atomicAdd(grad_value + off, wt00 * scaled_grad);
+  }
+  if (y_lo >= 0 && x_hi <= width - 1)
+  {
+    const int off = y_lo_off + x_hi_off + slot;
+    v01 = bottom_data[off];
+    d_h -= frac_x * v01;
+    d_w += rem_y * v01;
+    atomicAdd(grad_value + off, wt01 * scaled_grad);
+  }
+  if (y_hi <= height - 1 && x_lo >= 0)
+  {
+    const int off = y_hi_off + x_lo_off + slot;
+    v10 = bottom_data[off];
+    d_h += rem_x * v10;
+    d_w -= frac_y * v10;
+    atomicAdd(grad_value + off, wt10 * scaled_grad);
+  }
+  if (y_hi <= height - 1 && x_hi <= width - 1)
+  {
+    const int off = y_hi_off + x_hi_off + slot;
+    v11 = bottom_data[off];
+    d_h += frac_x * v11;
+    d_w += frac_y * v11;
+    atomicAdd(grad_value + off, wt11 * scaled_grad);
+  }
+  const scalar_t sample = (wt00 * v00 + wt01 * v01 + wt10 * v10 + wt11 * v11);
+  grad_attn_weight[0] = top_grad * sample;
+  grad_sampling_loc[0] = width * d_w * scaled_grad;
+  grad_sampling_loc[1] = height * d_h * scaled_grad;
+}
+
+
+// Backward of the bilinear sample for one (head m, channel c) element at (h, w),
+// variant in which every output is accumulated with atomicAdd: grad_value at
+// each readable corner, plus *grad_attn_weight, grad_sampling_loc[0] (w) and
+// grad_sampling_loc[1] (h). Nothing is overwritten, so the output pointers may
+// refer to locations that other threads accumulate into as well.
+template <typename scalar_t>
+__device__ void ms_deform_attn_col2im_bilinear_gm(
+    const scalar_t* &bottom_data, const int &height, const int &width,
+    const int &nheads, const int &channels, const scalar_t &h,
+    const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad,
+    const scalar_t &attn_weight, scalar_t* &grad_value,
+    scalar_t* grad_sampling_loc, scalar_t* grad_attn_weight)
+{
+  const int y_lo = floor(h);
+  const int x_lo = floor(w);
+  const int y_hi = y_lo + 1;
+  const int x_hi = x_lo + 1;
+  const scalar_t frac_y = h - y_lo;
+  const scalar_t frac_x = w - x_lo;
+  const scalar_t rem_y = 1 - frac_y, rem_x = 1 - frac_x;
+
+  // Element offsets of both rows, both columns and the (m, c) slot.
+  const int col_stride = nheads * channels;
+  const int row_stride = width * col_stride;
+  const int y_lo_off = y_lo * row_stride;
+  const int y_hi_off = y_lo_off + row_stride;
+  const int x_lo_off = x_lo * col_stride;
+  const int x_hi_off = x_lo_off + col_stride;
+  const int slot = m * channels + c;
+
+  const scalar_t wt00 = rem_y * rem_x, wt01 = rem_y * frac_x;
+  const scalar_t wt10 = frac_y * rem_x, wt11 = frac_y * frac_x;
+  const scalar_t scaled_grad = top_grad * attn_weight;
+  scalar_t d_h = 0, d_w = 0;  // derivative of the sample w.r.t. h and w
+  scalar_t v00 = 0, v01 = 0, v10 = 0, v11 = 0;
+
+  if (y_lo >= 0 && x_lo >= 0)
+  {
+    const int off = y_lo_off + x_lo_off + slot;
+    v00 = bottom_data[off];
+    d_h -= rem_x * v00;
+    d_w -= rem_y * v00;
+    atomicAdd(grad_value + off, wt00 * scaled_grad);
+  }
+  if (y_lo >= 0 && x_hi <= width - 1)
+  {
+    const int off = y_lo_off + x_hi_off + slot;
+    v01 = bottom_data[off];
+    d_h -= frac_x * v01;
+    d_w += rem_y * v01;
+    atomicAdd(grad_value + off, wt01 * scaled_grad);
+  }
+  if (y_hi <= height - 1 && x_lo >= 0)
+  {
+    const int off = y_hi_off + x_lo_off + slot;
+    v10 = bottom_data[off];
+    d_h += rem_x * v10;
+    d_w -= frac_y * v10;
+    atomicAdd(grad_value + off, wt10 * scaled_grad);
+  }
+  if (y_hi <= height - 1 && x_hi <= width - 1)
+  {
+    const int off = y_hi_off + x_hi_off + slot;
+    v11 = bottom_data[off];
+    d_h += frac_x * v11;
+    d_w += frac_y * v11;
+    atomicAdd(grad_value + off, wt11 * scaled_grad);
+  }
+  const scalar_t sample = (wt00 * v00 + wt01 * v01 + wt10 * v10 + wt11 * v11);
+  atomicAdd(grad_attn_weight, top_grad * sample);
+  atomicAdd(grad_sampling_loc, width * d_w * scaled_grad);
+  atomicAdd(grad_sampling_loc + 1, height * d_h * scaled_grad);
+}
+
+
+// Forward kernel of multi-scale deformable attention.
+//
+// Every loop iteration produces one element of data_col. The flat index decodes
+// as ((b * num_query + q) * num_heads + m) * channels + c, and the element is
+// the attention-weighted sum of num_levels * num_point bilinear samples taken
+// from data_value.
+//
+// Only the x dimension of the grid and block is used; CUDA_KERNEL_LOOP strides
+// by the launched thread count, so any 1-D launch covers all n elements.
+// Each location is scaled by the level's (h, w) and shifted by -0.5; a sample
+// is skipped unless -1 < h_im < spatial_h and -1 < w_im < spatial_w.
+// batch_size is accepted but not read here.
+template <typename scalar_t>
+__global__ void ms_deformable_im2col_gpu_kernel(
+    const int n, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads,
+    const int channels, const int num_levels, const int num_query,
+    const int num_point, scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    const int c_col = index % channels;
+    const int sampling_index = index / channels;
+    const int m_col = sampling_index % num_heads;
+    const int b_col = sampling_index / num_heads / num_query;
+
+    const int qid_stride = num_heads * channels;
+    const int batch_value_offset = b_col * spatial_size * qid_stride;
+    // Position of this (b, q, m)'s first sample in data_attn_weight; the
+    // matching (w, h) pair sits at twice that position in data_sampling_loc.
+    const int first_sample = sampling_index * num_levels * num_point;
+    scalar_t acc = 0;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h = data_spatial_shapes[2 * l_col];
+      const int spatial_w = data_spatial_shapes[2 * l_col + 1];
+      const scalar_t *data_value_ptr =
+          data_value + (batch_value_offset + level_start_id * qid_stride);
+      for (int p_col = 0; p_col < num_point; ++p_col)
+      {
+        const int sample = first_sample + l_col * num_point + p_col;
+        const scalar_t loc_w = data_sampling_loc[2 * sample];
+        const scalar_t loc_h = data_sampling_loc[2 * sample + 1];
+        const scalar_t weight = data_attn_weight[sample];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Kept as a positive test so a NaN coordinate is skipped as well.
+        const bool inside =
+            h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w;
+        if (!inside) continue;
+        acc += ms_deform_attn_im2col_bilinear(
+                   data_value_ptr, spatial_h, spatial_w, num_heads, channels,
+                   h_im, w_im, m_col, c_col) * weight;
+      }
+    }
+    data_col[index] = acc;
+  }
+}
+
+// Backward kernel of multi-scale deformable attention that reduces through
+// statically sized shared arrays, with thread 0 summing the block serially.
+//
+// For element index = ((b * num_query + q) * num_heads + m) * channels + c it
+// scatters into grad_value with atomicAdd (in ms_deform_attn_col2im_bilinear)
+// and, per (level, point) sample k, stores the block-wide gradient sums into
+// grad_sampling_loc[2k] (w), grad_sampling_loc[2k + 1] (h), grad_attn_weight[k].
+//
+// Launch requirements that follow from the code:
+//   * blockDim.x == blockSize: the shared arrays have blockSize slots and
+//     thread 0 adds up exactly blockSize of them.
+//   * all threads of a block run the same number of loop iterations, because
+//     __syncthreads() is executed inside CUDA_KERNEL_LOOP.
+//   * all threads of a block share one sampling_index (index / channels) --
+//     presumably blockDim.x == channels at the launch site; confirm there.
+// No dynamic shared memory is needed. batch_size is not read.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads,
+    const int channels, const int num_levels, const int num_query,
+    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+    scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    const unsigned int tid = threadIdx.x;
+    // This thread's private slots in the two reduction buffers.
+    scalar_t *my_grad_loc = cache_grad_sampling_loc + (tid << 1);
+    scalar_t *my_grad_weight = cache_grad_attn_weight + tid;
+
+    const int c_col = index % channels;
+    const int sampling_index = index / channels;
+    const int m_col = sampling_index % num_heads;
+    const int b_col = sampling_index / num_heads / num_query;
+
+    const scalar_t top_grad = grad_col[index];
+
+    // These cursors index the inputs and the gradient outputs; the outputs use the
+    // unmodified base pointers, so a repeated loop iteration cannot compound offsets.
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        // Out-of-range samples must still contribute zero to the block sum.
+        my_grad_loc[0] = 0;
+        my_grad_loc[1] = 0;
+        *my_grad_weight = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            my_grad_loc, my_grad_weight);
+        }
+
+        // Make every thread's partial result visible before thread 0 sums them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0];
+          for (unsigned int t = 1; t < blockSize; ++t)
+          {
+            _grad_w += cache_grad_sampling_loc[t << 1];
+            _grad_h += cache_grad_sampling_loc[(t << 1) + 1];
+            _grad_a += cache_grad_attn_weight[t];
+          }
+          grad_sampling_loc[data_loc_w_ptr] = _grad_w;
+          grad_sampling_loc[data_loc_w_ptr + 1] = _grad_h;
+          grad_attn_weight[data_weight_ptr] = _grad_a;
+        }
+        // Keep the buffers intact until thread 0 is done before they are reused.
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+  }
+}
+
+
+// Backward kernel of multi-scale deformable attention that reduces through
+// statically sized shared arrays using a pairwise (halving) tree reduction.
+//
+// For element index = ((b * num_query + q) * num_heads + m) * channels + c it
+// scatters into grad_value with atomicAdd (in ms_deform_attn_col2im_bilinear)
+// and, per (level, point) sample k, stores the block-wide gradient sums into
+// grad_sampling_loc[2k] (w), grad_sampling_loc[2k + 1] (h), grad_attn_weight[k].
+//
+// Launch requirements that follow from the code:
+//   * blockDim.x == blockSize and blockSize a power of two (the halving loop
+//     starts at blockSize / 2 and would skip slots otherwise).
+//   * all threads of a block run the same number of loop iterations, because
+//     __syncthreads() is executed inside CUDA_KERNEL_LOOP.
+//   * all threads of a block share one sampling_index (index / channels) --
+//     presumably blockDim.x == channels at the launch site; confirm there.
+// No dynamic shared memory is needed. batch_size is not read.
+template <typename scalar_t, unsigned int blockSize>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
+    const int n, const scalar_t *grad_col, const scalar_t *data_value,
+    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
+    const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight,
+    const int batch_size, const int spatial_size, const int num_heads,
+    const int channels, const int num_levels, const int num_query,
+    const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc,
+    scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
+    __shared__ scalar_t cache_grad_attn_weight[blockSize];
+    const unsigned int tid = threadIdx.x;
+    scalar_t *my_grad_loc = cache_grad_sampling_loc + (tid << 1);
+    scalar_t *my_grad_weight = cache_grad_attn_weight + tid;
+
+    const int c_col = index % channels;
+    const int sampling_index = index / channels;
+    const int m_col = sampling_index % num_heads;
+    const int b_col = sampling_index / num_heads / num_query;
+
+    const scalar_t top_grad = grad_col[index];
+    // These cursors index the inputs and the gradient outputs; the outputs use the
+    // unmodified base pointers, so a repeated loop iteration cannot compound offsets.
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+
+    for (int l_col = 0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col = 0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        my_grad_loc[0] = 0;
+        my_grad_loc[1] = 0;
+        *my_grad_weight = 0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr,
+            my_grad_loc, my_grad_weight);
+        }
+
+        // Make every thread's partial result visible before the tree reduction.
+        __syncthreads();
+        for (unsigned int half = blockSize / 2; half > 0; half >>= 1)
+        {
+          if (tid < half)
+          {
+            const unsigned int dst = tid << 1;
+            const unsigned int src = (tid + half) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + half];
+            cache_grad_sampling_loc[dst] += cache_grad_sampling_loc[src];
+            cache_grad_sampling_loc[dst + 1] += cache_grad_sampling_loc[src + 1];
+          }
+          __syncthreads();
+        }
+
+        if (tid == 0)
+        {
+          grad_sampling_loc[data_loc_w_ptr] = cache_grad_sampling_loc[0];
+          grad_sampling_loc[data_loc_w_ptr + 1] = cache_grad_sampling_loc[1];
+          grad_attn_weight[data_weight_ptr] = cache_grad_attn_weight[0];
+        }
+        __syncthreads();
+
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+      }
+    }
+  }
+}
+// Backward kernel, shared-memory variant with a serial reduction: each thread handles one
+// grad_col element and thread 0 sums the block's per-thread partial gradients.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)  // NOTE(review): body contains __syncthreads(); assumes whole blocks enter together (n % blockDim.x == 0) - confirm
+  {
+    extern __shared__ int _s[];  // dynamic shared memory; the layout below uses 3 * blockDim.x scalar_t
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // two slots (w, h) per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // peeled below as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+    // Upstream gradient of the output element owned by this thread.
+    const scalar_t top_grad = grad_col[index];
+    // Offsets of the first (level 0, point 0) sample of this (batch, query, head).
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;  // two location values per sample
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    // Visit every feature level, then every sampling point of that level.
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;  // shapes are stored as consecutive (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the location by the level size and shift it by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots for the current point
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // other locations leave the slots at zero
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Every thread's slots must be written before thread 0 reads them.
+        __syncthreads();
+        if (tid == 0)
+        {
+          scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
+          int sid=2;
+          for (unsigned int tid = 1; tid < blockDim.x; ++tid)  // this loop's tid shadows the outer tid
+          {
+            _grad_w += cache_grad_sampling_loc[sid];
+            _grad_h += cache_grad_sampling_loc[sid + 1];
+            _grad_a += cache_grad_attn_weight[tid];
+            sid += 2;
+          }
+          // Plain stores, not atomics: presumably this block is the only writer of this
+          // sample's gradients (blockDim.x == channels) - TODO confirm against the launcher.
+          *grad_sampling_loc = _grad_w;
+          *(grad_sampling_loc + 1) = _grad_h;
+          *grad_attn_weight = _grad_a;
+        }
+        __syncthreads();  // slots stay untouched until thread 0 has finished summing
+        // Step all cursors to the next sampling point.
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+// Backward kernel, shared-memory variant: per-thread partial gradients are combined by a tree reduction that tolerates odd sizes.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)  // NOTE(review): body contains __syncthreads(); assumes whole blocks enter together (n % blockDim.x == 0) - confirm
+  {
+    extern __shared__ int _s[];  // dynamic shared memory; the layout below uses 3 * blockDim.x scalar_t
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // peeled below as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+    // Upstream gradient of the output element owned by this thread.
+    const scalar_t top_grad = grad_col[index];
+    // Offsets of the first (level 0, point 0) sample of this (batch, query, head).
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;  // two location values per sample
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    // Visit every feature level, then every sampling point of that level.
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;  // shapes are stored as consecutive (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the location by the level size and shift it by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots for the current point
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // other locations leave the slots at zero
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Every thread's slots must be written before the reduction reads them.
+        __syncthreads();
+        // Tree reduction: each pass folds slots [s, 2s) onto [0, s); spre is the live count before the pass.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)  // spre odd: slot 2s has no partner, so thread 0 folds it in as well
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            } 
+          }
+          __syncthreads();  // finish this pass before the next one reads its results
+        }
+        // Slot 0 now holds the block-wide sums; thread 0 stores them with plain writes.
+        if (tid == 0)
+        {
+          *grad_sampling_loc = cache_grad_sampling_loc[0];
+          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
+          *grad_attn_weight = cache_grad_attn_weight[0];
+        }
+        __syncthreads();  // slots stay untouched until thread 0 has stored the result
+        // Step all cursors to the next sampling point.
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+// Backward kernel, shared-memory tree reduction per block; each block's sums are added to global memory with atomicAdd.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)  // NOTE(review): body contains __syncthreads(); assumes whole blocks enter together (n % blockDim.x == 0) - confirm
+  {
+    extern __shared__ int _s[];  // dynamic shared memory; the layout below uses 3 * blockDim.x scalar_t
+    scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;  // two slots per thread
+    scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;  // one slot per thread
+    unsigned int tid = threadIdx.x;
+    int _temp = index;  // peeled below as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+    // Upstream gradient of the output element owned by this thread.
+    const scalar_t top_grad = grad_col[index];
+    // Offsets of the first (level 0, point 0) sample of this (batch, query, head).
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;  // two location values per sample
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    // Visit every feature level, then every sampling point of that level.
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;  // shapes are stored as consecutive (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the location by the level size and shift it by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;  // clear this thread's slots for the current point
+        *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
+        *(cache_grad_attn_weight+threadIdx.x)=0;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // other locations leave the slots at zero
+        {
+          ms_deform_attn_col2im_bilinear(
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
+        }
+        // Every thread's slots must be written before the reduction reads them.
+        __syncthreads();
+        // Tree reduction: each pass folds slots [s, 2s) onto [0, s); spre is the live count before the pass.
+        for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
+        {
+          if (tid < s) {
+            const unsigned int xid1 = tid << 1;
+            const unsigned int xid2 = (tid + s) << 1;
+            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
+            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
+            cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
+            if (tid + (s << 1) < spre)  // spre odd: slot 2s has no partner, so thread 0 folds it in as well
+            {
+              cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
+              cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
+              cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
+            }
+          }
+          __syncthreads();  // finish this pass before the next one reads its results
+        }
+        // Slot 0 holds this block's sums; presumably several blocks share one sample here, hence the atomics - confirm.
+        if (tid == 0)
+        {
+          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
+          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+        }
+        __syncthreads();  // slots stay untouched until thread 0 has issued the atomics
+        // Step all cursors to the next sampling point.
+        data_weight_ptr += 1;
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+// Backward kernel without shared memory: every thread hands the global gradient pointers
+// straight to ms_deform_attn_col2im_bilinear_gm, so no block-level reduction or barrier is used.
+template <typename scalar_t>
+__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
+                                                const scalar_t *grad_col,
+                                                const scalar_t *data_value,
+                                                const int64_t *data_spatial_shapes,
+                                                const int64_t *data_level_start_index, 
+                                                const scalar_t *data_sampling_loc,
+                                                const scalar_t *data_attn_weight,
+                                                const int batch_size, 
+                                                const int spatial_size, 
+                                                const int num_heads,
+                                                const int channels, 
+                                                const int num_levels,
+                                                const int num_query,
+                                                const int num_point,
+                                                scalar_t *grad_value,
+                                                scalar_t *grad_sampling_loc,
+                                                scalar_t *grad_attn_weight)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    int _temp = index;  // peeled below as ((b * num_query + q) * num_heads + m) * channels + c
+    const int c_col = _temp % channels;
+    _temp /= channels;
+    const int sampling_index = _temp; 
+    const int m_col = _temp % num_heads;
+    _temp /= num_heads;
+    const int q_col = _temp % num_query;  // not referenced again in this kernel
+    _temp /= num_query;
+    const int b_col = _temp;
+    // Upstream gradient of the output element owned by this thread.
+    const scalar_t top_grad = grad_col[index];
+    // Offsets of the first (level 0, point 0) sample of this (batch, query, head).
+    int data_weight_ptr = sampling_index * num_levels * num_point;
+    int data_loc_w_ptr = data_weight_ptr << 1;  // two location values per sample
+    const int grad_sampling_ptr = data_weight_ptr;
+    grad_sampling_loc += grad_sampling_ptr << 1;
+    grad_attn_weight += grad_sampling_ptr;
+    const int grad_weight_stride = 1;
+    const int grad_loc_stride = 2;
+    const int qid_stride = num_heads * channels;
+    const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
+    // Visit every feature level, then every sampling point of that level.
+    for (int l_col=0; l_col < num_levels; ++l_col)
+    {
+      const int level_start_id = data_level_start_index[l_col];
+      const int spatial_h_ptr = l_col << 1;  // shapes are stored as consecutive (h, w) pairs
+      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
+      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
+      const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
+      const scalar_t *data_value_ptr = data_value + value_ptr_offset;
+      scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
+
+      for (int p_col=0; p_col < num_point; ++p_col)
+      {
+        const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
+        const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
+        const scalar_t weight = data_attn_weight[data_weight_ptr];
+        // Scale the location by the level size and shift it by half a pixel.
+        const scalar_t h_im = loc_h * spatial_h - 0.5;
+        const scalar_t w_im = loc_w * spatial_w - 0.5;
+        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)  // other locations contribute nothing
+        {
+          ms_deform_attn_col2im_bilinear_gm(  // NOTE(review): presumably accumulates atomically, threads share these pointers - confirm
+            data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
+            top_grad, weight, grad_value_ptr, 
+            grad_sampling_loc, grad_attn_weight);
+        }
+        data_weight_ptr += 1;  // step all cursors to the next sampling point
+        data_loc_w_ptr += 2;
+        grad_attn_weight += grad_weight_stride;
+        grad_sampling_loc += grad_loc_stride;
+      }
+    }
+  }
+}
+// Host launcher for the forward pass: enqueues ms_deformable_im2col_gpu_kernel on `stream`
+// with one work item per (batch, query, head, channel) combination.
+template <typename scalar_t>
+void ms_deformable_im2col_cuda(cudaStream_t stream,
+                              const scalar_t* data_value,
+                              const int64_t* data_spatial_shapes,
+                              const int64_t* data_level_start_index,
+                              const scalar_t* data_sampling_loc,
+                              const scalar_t* data_attn_weight,
+                              const int batch_size,
+                              const int spatial_size,
+                              const int num_heads,
+                              const int channels,
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point,
+                              scalar_t* data_col)
+{
+  // A single count drives both the grid size and the kernel's element bound.
+  const int total_elems = batch_size * num_query * num_heads * channels;
+  const int threads_per_block = CUDA_NUM_THREADS;
+  ms_deformable_im2col_gpu_kernel<scalar_t>
+      <<<GET_BLOCKS(total_elems, threads_per_block), threads_per_block, 0, stream>>>(
+      total_elems, data_value, data_spatial_shapes, data_level_start_index,
+      data_sampling_loc, data_attn_weight, batch_size, spatial_size, num_heads,
+      channels, num_levels, num_query, num_point, data_col);
+
+  // A failed launch is only printed; nothing is returned to the caller.
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess)
+  {
+    printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(launch_status));
+  }
+}
+// Host launcher for the backward pass: picks a col2im kernel variant from `channels` and enqueues it on `stream`.
+template <typename scalar_t>
+void ms_deformable_col2im_cuda(cudaStream_t stream,
+                              const scalar_t* grad_col,
+                              const scalar_t* data_value,
+                              const int64_t * data_spatial_shapes,
+                              const int64_t * data_level_start_index,
+                              const scalar_t * data_sampling_loc,
+                              const scalar_t * data_attn_weight,
+                              const int batch_size, 
+                              const int spatial_size, 
+                              const int num_heads,
+                              const int channels, 
+                              const int num_levels,
+                              const int num_query,
+                              const int num_point, 
+                              scalar_t* grad_value,
+                              scalar_t* grad_sampling_loc,
+                              scalar_t* grad_attn_weight)
+{
+  const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;  // block size: min(channels, CUDA_NUM_THREADS)
+  const int num_kernels = batch_size * num_query * num_heads * channels;  // one work item per grad_col element
+  const int num_actual_kernels = batch_size * num_query * num_heads * channels;  // same value as num_kernels
+  if (channels > 1024)  // beyond the largest specialised case below
+  {
+    if ((channels & 1023) == 0)  // multiple of 1024: per-block reduction, block sums merged with atomicAdd
+    {
+      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+    }
+    else  // any other channel count: kernel that uses no shared memory
+    {
+      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+    }
+  }
+  else{
+    switch(channels)  // powers of two up to 1024 use kernels templated on the block size, launched with no dynamic shared memory
+    {
+      case 1:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 2:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 4:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 8:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 16:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 32:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 64:  // from 64 channels upwards the _v2 variant of the templated kernel is used
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 128:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 256:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 512:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      case 1024:
+        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
+        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+            0, stream>>>(
+                      num_kernels, 
+                      grad_col,
+                      data_value,
+                      data_spatial_shapes,
+                      data_level_start_index, 
+                      data_sampling_loc,
+                      data_attn_weight,
+                      batch_size, 
+                      spatial_size, 
+                      num_heads,
+                      channels, 
+                      num_levels,
+                      num_query,
+                      num_point,
+                      grad_value,
+                      grad_sampling_loc,
+                      grad_attn_weight);
+        break;
+      default:  // remaining channel counts: kernels that read the block size from blockDim.x and need dynamic shared memory
+        if (channels < 64)  // small blocks: thread 0 sums the partials serially
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+        else  // larger blocks: tree reduction
+        {
+          ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
+          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
+              num_threads*3*sizeof(scalar_t), stream>>>(
+                        num_kernels, 
+                        grad_col,
+                        data_value,
+                        data_spatial_shapes,
+                        data_level_start_index, 
+                        data_sampling_loc,
+                        data_attn_weight,
+                        batch_size, 
+                        spatial_size, 
+                        num_heads,
+                        channels, 
+                        num_levels,
+                        num_query,
+                        num_point,
+                        grad_value,
+                        grad_sampling_loc,
+                        grad_attn_weight);
+        }
+    }
+  }
+  cudaError_t err = cudaGetLastError();  // a failure is only printed; nothing is returned to the caller
+  if (err != cudaSuccess)
+  {
+    printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/ms_deform_attn.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/ms_deform_attn.h
new file mode 100644
index 0000000000000000000000000000000000000000..119b1fa317d1e5fcfb61a4837e560e9248db05f3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/ms_deform_attn.h
@@ -0,0 +1,61 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#pragma once
+
+#include "cpu/ms_deform_attn_cpu.h"
+
+#ifdef WITH_CUDA
+#include "cuda/ms_deform_attn_cuda.h"
+#endif
+
+
+// Multi-scale deformable attention, forward pass. Hands every argument to
+// ms_deform_attn_cuda_forward when `value` is a CUDA tensor and WITH_CUDA is defined;
+// all other cases fail through AT_ERROR. `inline` because the definition lives in a header.
+inline at::Tensor ms_deform_attn_forward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const int im2col_step)
+{
+    // Tensor::is_cuda() replaces the deprecated Tensor::type().is_cuda().
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_forward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
+
+// Multi-scale deformable attention, backward pass. Hands every argument to
+// ms_deform_attn_cuda_backward when `value` is a CUDA tensor and WITH_CUDA is defined;
+// all other cases fail through AT_ERROR. `inline` because the definition lives in a header.
+inline std::vector<at::Tensor> ms_deform_attn_backward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight, const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    // Tensor::is_cuda() replaces the deprecated Tensor::type().is_cuda().
+    if (value.is_cuda())
+    {
+#ifdef WITH_CUDA
+        return ms_deform_attn_cuda_backward(
+            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
+#else
+        AT_ERROR("Not compiled with GPU support");
+#endif
+    }
+    AT_ERROR("Not implemented on the CPU");
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/vision.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/vision.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6ce3875568b9ba8d660c90acc805077cca98f891
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/deta/vision.cpp
@@ -0,0 +1,16 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+#include "ms_deform_attn.h"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings for the two functions defined in ms_deform_attn.h
+  m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");  // Python name, C++ function, docstring
+  m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");  // same pattern for the backward pass
+}
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..da88e3394f653369a7443245c67dcbe57f2ed23e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .selective_scan_with_ln_interface import mamba_inner_fn
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4949778aaf6f0ec7de038aa63970457b44f254a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/selective_scan_with_ln_interface.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/selective_scan_with_ln_interface.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8103fa82787a7da29d05d64a74c480d9646fc77
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/__pycache__/selective_scan_with_ln_interface.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a74986a81a13f9428eab353de5b61a4d101972d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py
@@ -0,0 +1,525 @@
+# coding=utf-8
+# Copyright 2024 Tri Dao, Albert Gu, Technological Innovation Institute and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Original code from: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/selective_scan_interface.py
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch.cuda.amp import custom_bwd, custom_fwd
+
+
+try:
+    import causal_conv1d_cuda
+except ImportError:
+    causal_conv1d_cuda = None
+
+import mamba_ssm
+import selective_scan_cuda
+
+
+# For BC for old mamba-ssm versions: https://github.com/huggingface/transformers/pull/33195#discussion_r1736401127
+if hasattr(mamba_ssm.ops.triton, "layernorm"):
+    from mamba_ssm.ops.triton.layernorm import _layer_norm_fwd
+else:
+    from mamba_ssm.ops.triton.layer_norm import _layer_norm_fwd
+
+
+class SelectiveScanFn(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False
+    ):
+        if u.stride(-1) != 1:
+            u = u.contiguous()
+        if delta.stride(-1) != 1:
+            delta = delta.contiguous()
+        if D is not None:
+            D = D.contiguous()
+        if B.stride(-1) != 1:
+            B = B.contiguous()
+        if C.stride(-1) != 1:
+            C = C.contiguous()
+        if z is not None and z.stride(-1) != 1:
+            z = z.contiguous()
+        if B.dim() == 3:
+            B = rearrange(B, "b dstate l -> b 1 dstate l")
+            ctx.squeeze_B = True
+        if C.dim() == 3:
+            C = rearrange(C, "b dstate l -> b 1 dstate l")
+            ctx.squeeze_C = True
+        out, x, *rest = selective_scan_cuda.fwd(u, delta, A, B, C, D, z, delta_bias, delta_softplus)
+        ctx.delta_softplus = delta_softplus
+        ctx.has_z = z is not None
+        last_state = x[:, :, -1, 1::2]  # (batch, dim, dstate)
+        if not ctx.has_z:
+            ctx.save_for_backward(u, delta, A, B, C, D, delta_bias, x)
+            return out if not return_last_state else (out, last_state)
+        else:
+            ctx.save_for_backward(u, delta, A, B, C, D, z, delta_bias, x, out)
+            out_z = rest[0]
+            return out_z if not return_last_state else (out_z, last_state)
+
+    @staticmethod
+    def backward(ctx, dout, *args):
+        if not ctx.has_z:
+            u, delta, A, B, C, D, delta_bias, x = ctx.saved_tensors
+            z = None
+            out = None
+        else:
+            u, delta, A, B, C, D, z, delta_bias, x, out = ctx.saved_tensors
+        if dout.stride(-1) != 1:
+            dout = dout.contiguous()
+        # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
+        # backward of selective_scan_cuda with the backward of chunk).
+        # Here we just pass in None and dz will be allocated in the C++ code.
+        du, ddelta, dA, dB, dC, dD, ddelta_bias, *rest = selective_scan_cuda.bwd(
+            u,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            z,
+            delta_bias,
+            dout,
+            x,
+            out,
+            None,
+            ctx.delta_softplus,
+            False,  # option to recompute out_z, not used here
+        )
+        dz = rest[0] if ctx.has_z else None
+        dB = dB.squeeze(1) if getattr(ctx, "squeeze_B", False) else dB
+        dC = dC.squeeze(1) if getattr(ctx, "squeeze_C", False) else dC
+        return (
+            du,
+            ddelta,
+            dA,
+            dB,
+            dC,
+            dD if D is not None else None,
+            dz,
+            ddelta_bias if delta_bias is not None else None,
+            None,
+            None,
+        )
+
+
+def rms_norm_forward(
+    x,
+    weight,
+    bias,
+    eps=1e-6,
+    is_rms_norm=True,
+):
+    # x (b l) d
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    weight = weight.contiguous()
+    if bias is not None:
+        bias = bias.contiguous()
+    y = _layer_norm_fwd(x, weight, bias, eps, None, residual_dtype=None, is_rms_norm=is_rms_norm)[0]
+    # y (b l) d
+    return y
+
+
+def selective_scan_fn(
+    u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False
+):
+    """if return_last_state is True, returns (out, last_state)
+    last_state has shape (batch, dim, dstate). Note that the gradient of the last state is
+    not considered in the backward pass.
+    """
+    return SelectiveScanFn.apply(u, delta, A, B, C, D, z, delta_bias, delta_softplus, return_last_state)
+
+
+def selective_scan_ref(
+    u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False, return_last_state=False
+):
+    """
+    u: r(B D L)
+    delta: r(B D L)
+    A: c(D N) or r(D N)
+    B: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
+    C: c(D N) or r(B N L) or r(B N 2L) or r(B G N L) or (B G N L)
+    D: r(D)
+    z: r(B D L)
+    delta_bias: r(D), fp32
+
+    out: r(B D L)
+    last_state (optional): r(B D dstate) or c(B D dstate)
+    """
+    dtype_in = u.dtype
+    u = u.float()
+    delta = delta.float()
+    if delta_bias is not None:
+        delta = delta + delta_bias[..., None].float()
+    if delta_softplus:
+        delta = F.softplus(delta)
+    batch, dim, dstate = u.shape[0], A.shape[0], A.shape[1]
+    is_variable_B = B.dim() >= 3
+    is_variable_C = C.dim() >= 3
+    if A.is_complex():
+        if is_variable_B:
+            B = torch.view_as_complex(rearrange(B.float(), "... (L two) -> ... L two", two=2))
+        if is_variable_C:
+            C = torch.view_as_complex(rearrange(C.float(), "... (L two) -> ... L two", two=2))
+    else:
+        B = B.float()
+        C = C.float()
+    x = A.new_zeros((batch, dim, dstate))
+    ys = []
+    deltaA = torch.exp(torch.einsum("bdl,dn->bdln", delta, A))
+    if not is_variable_B:
+        deltaB_u = torch.einsum("bdl,dn,bdl->bdln", delta, B, u)
+    else:
+        if B.dim() == 3:
+            deltaB_u = torch.einsum("bdl,bnl,bdl->bdln", delta, B, u)
+        else:
+            B = repeat(B, "B G N L -> B (G H) N L", H=dim // B.shape[1])
+            deltaB_u = torch.einsum("bdl,bdnl,bdl->bdln", delta, B, u)
+    if is_variable_C and C.dim() == 4:
+        C = repeat(C, "B G N L -> B (G H) N L", H=dim // C.shape[1])
+    last_state = None
+    for i in range(u.shape[2]):
+        x = deltaA[:, :, i] * x + deltaB_u[:, :, i]
+        if not is_variable_C:
+            y = torch.einsum("bdn,dn->bd", x, C)
+        else:
+            if C.dim() == 3:
+                y = torch.einsum("bdn,bn->bd", x, C[:, :, i])
+            else:
+                y = torch.einsum("bdn,bdn->bd", x, C[:, :, :, i])
+        if i == u.shape[2] - 1:
+            last_state = x
+        if y.is_complex():
+            y = y.real * 2
+        ys.append(y)
+    y = torch.stack(ys, dim=2)  # (batch dim L)
+    out = y if D is None else y + u * rearrange(D, "d -> d 1")
+    if z is not None:
+        out = out * F.silu(z)
+    out = out.to(dtype=dtype_in)
+    return out if not return_last_state else (out, last_state)
+
+
+class MambaInnerFn(torch.autograd.Function):
+    @staticmethod
+    @custom_fwd
+    def forward(
+        ctx,
+        xz,
+        conv1d_weight,
+        conv1d_bias,
+        x_proj_weight,
+        delta_proj_weight,
+        out_proj_weight,
+        out_proj_bias,
+        A,
+        B=None,
+        C=None,
+        D=None,
+        delta_bias=None,
+        B_proj_bias=None,
+        C_proj_bias=None,
+        delta_softplus=True,
+        checkpoint_lvl=1,
+        b_rms_weight=None,
+        c_rms_weight=None,
+        dt_rms_weight=None,
+        b_c_dt_rms_eps=1e-6,
+    ):
+        """
+        xz: (batch, dim, seqlen)
+        """
+        assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
+        assert checkpoint_lvl in [0, 1]
+        L = xz.shape[-1]
+        delta_rank = delta_proj_weight.shape[1]
+        d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
+        if torch.is_autocast_enabled():
+            x_proj_weight = x_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
+            delta_proj_weight = delta_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
+            out_proj_weight = out_proj_weight.to(dtype=torch.get_autocast_gpu_dtype())
+            out_proj_bias = (
+                out_proj_bias.to(dtype=torch.get_autocast_gpu_dtype()) if out_proj_bias is not None else None
+            )
+        if xz.stride(-1) != 1:
+            xz = xz.contiguous()
+        conv1d_weight = rearrange(conv1d_weight, "d 1 w -> d w")
+        x, z = xz.chunk(2, dim=1)
+        conv1d_bias = conv1d_bias.contiguous() if conv1d_bias is not None else None
+        conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, None, None, True)
+        # We're being very careful here about the layout, to avoid extra transposes.
+        # We want delta to have d as the slowest moving dimension
+        # and L as the fastest moving dimension, since those are what the ssm_scan kernel expects.
+        x_dbl = F.linear(rearrange(conv1d_out, "b d l -> (b l) d"), x_proj_weight)  # (bl d)
+        delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L)
+        ctx.is_variable_B = B is None
+        ctx.is_variable_C = C is None
+        ctx.B_proj_bias_is_None = B_proj_bias is None
+        ctx.C_proj_bias_is_None = C_proj_bias is None
+        if B is None:  # variable B
+            B = x_dbl[:, delta_rank : delta_rank + d_state]  # (bl dstate)
+            if B_proj_bias is not None:
+                B = B + B_proj_bias.to(dtype=B.dtype)
+            if not A.is_complex():
+                # B = rearrange(B, "(b l) dstate -> b dstate l", l=L).contiguous()
+                B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+            else:
+                B = rearrange(B, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
+        else:
+            if B.stride(-1) != 1:
+                B = B.contiguous()
+        if C is None:  # variable C
+            C = x_dbl[:, -d_state:]  # (bl dstate)
+            if C_proj_bias is not None:
+                C = C + C_proj_bias.to(dtype=C.dtype)
+            if not A.is_complex():
+                # C = rearrange(C, "(b l) dstate -> b dstate l", l=L).contiguous()
+                C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+            else:
+                C = rearrange(C, "(b l) (dstate two) -> b 1 dstate (l two)", l=L, two=2).contiguous()
+        else:
+            if C.stride(-1) != 1:
+                C = C.contiguous()
+        if D is not None:
+            D = D.contiguous()
+
+        if b_rms_weight is not None:
+            B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+            B = rms_norm_forward(B, b_rms_weight, bias=None, eps=b_c_dt_rms_eps)
+            B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+        if c_rms_weight is not None:
+            C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+            C = rms_norm_forward(C, c_rms_weight, bias=None, eps=b_c_dt_rms_eps)
+            C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+        if dt_rms_weight is not None:
+            delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
+            delta = rms_norm_forward(delta, dt_rms_weight, bias=None, eps=b_c_dt_rms_eps)
+            delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
+
+        out, scan_intermediates, out_z = selective_scan_cuda.fwd(
+            conv1d_out, delta, A, B, C, D, z, delta_bias, delta_softplus
+        )
+        ctx.delta_softplus = delta_softplus
+        ctx.out_proj_bias_is_None = out_proj_bias is None
+        ctx.checkpoint_lvl = checkpoint_lvl
+        ctx.b_rms_weight = b_rms_weight
+        ctx.c_rms_weight = c_rms_weight
+        ctx.dt_rms_weight = dt_rms_weight
+        ctx.b_c_dt_rms_eps = b_c_dt_rms_eps
+        if checkpoint_lvl >= 1:  # Will recompute conv1d_out and delta in the backward pass
+            conv1d_out, delta = None, None
+        ctx.save_for_backward(
+            xz,
+            conv1d_weight,
+            conv1d_bias,
+            x_dbl,
+            x_proj_weight,
+            delta_proj_weight,
+            out_proj_weight,
+            conv1d_out,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            delta_bias,
+            scan_intermediates,
+            b_rms_weight,
+            c_rms_weight,
+            dt_rms_weight,
+            out,
+        )
+        return F.linear(rearrange(out_z, "b d l -> b l d"), out_proj_weight, out_proj_bias)
+
+    @staticmethod
+    @custom_bwd
+    def backward(ctx, dout):
+        # dout: (batch, seqlen, dim)
+        assert causal_conv1d_cuda is not None, "causal_conv1d_cuda is not available. Please install causal-conv1d."
+        (
+            xz,
+            conv1d_weight,
+            conv1d_bias,
+            x_dbl,
+            x_proj_weight,
+            delta_proj_weight,
+            out_proj_weight,
+            conv1d_out,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            delta_bias,
+            scan_intermediates,
+            b_rms_weight,
+            c_rms_weight,
+            dt_rms_weight,
+            out,
+        ) = ctx.saved_tensors
+        L = xz.shape[-1]
+        delta_rank = delta_proj_weight.shape[1]
+        d_state = A.shape[-1] * (1 if not A.is_complex() else 2)
+        x, z = xz.chunk(2, dim=1)
+        if dout.stride(-1) != 1:
+            dout = dout.contiguous()
+        if ctx.checkpoint_lvl == 1:
+            conv1d_out = causal_conv1d_cuda.causal_conv1d_fwd(x, conv1d_weight, conv1d_bias, None, None, None, True)
+            delta = rearrange(delta_proj_weight @ x_dbl[:, :delta_rank].t(), "d (b l) -> b d l", l=L)
+            if dt_rms_weight is not None:
+                delta = rearrange(delta, "b d l -> (b l) d", l=L).contiguous()
+                delta = rms_norm_forward(delta, ctx.dt_rms_weight, None, ctx.b_c_dt_rms_eps)
+                delta = rearrange(delta, "(b l) d -> b d l", l=L).contiguous()
+            if b_rms_weight is not None:
+                # Recompute & RMSNorm B
+                B = rearrange(B, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+                B = rms_norm_forward(B, ctx.b_rms_weight, None, ctx.b_c_dt_rms_eps)
+                B = rearrange(B, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+            if c_rms_weight is not None:
+                # Recompute & RMSNorm C
+                C = rearrange(C, "b 1 dstate l -> (b l) dstate", l=L).contiguous()
+                C = rms_norm_forward(C, ctx.c_rms_weight, None, ctx.b_c_dt_rms_eps)
+                C = rearrange(C, "(b l) dstate -> b 1 dstate l", l=L).contiguous()
+
+        # The kernel supports passing in a pre-allocated dz (e.g., in case we want to fuse the
+        # backward of selective_scan_cuda with the backward of chunk).
+        dxz = torch.empty_like(xz)  # (batch, dim, seqlen)
+        dx, dz = dxz.chunk(2, dim=1)
+        dout = rearrange(dout, "b l e -> e (b l)")
+        dout_y = rearrange(out_proj_weight.t() @ dout, "d (b l) -> b d l", l=L)
+        dconv1d_out, ddelta, dA, dB, dC, dD, ddelta_bias, dz, out_z = selective_scan_cuda.bwd(
+            conv1d_out,
+            delta,
+            A,
+            B,
+            C,
+            D,
+            z,
+            delta_bias,
+            dout_y,
+            scan_intermediates,
+            out,
+            dz,
+            ctx.delta_softplus,
+            True,  # option to recompute out_z
+        )
+        dout_proj_weight = torch.einsum("eB,dB->ed", dout, rearrange(out_z, "b d l -> d (b l)"))
+        dout_proj_bias = dout.sum(dim=(0, 1)) if not ctx.out_proj_bias_is_None else None
+        dD = dD if D is not None else None
+        dx_dbl = torch.empty_like(x_dbl)
+        dB_proj_bias = None
+        if ctx.is_variable_B:
+            if not A.is_complex():
+                dB = rearrange(dB, "b 1 dstate l -> (b l) dstate").contiguous()
+            else:
+                dB = rearrange(dB, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
+            dB_proj_bias = dB.sum(0) if not ctx.B_proj_bias_is_None else None
+            dx_dbl[:, delta_rank : delta_rank + d_state] = dB  # (bl d)
+            dB = None
+        dC_proj_bias = None
+        if ctx.is_variable_C:
+            if not A.is_complex():
+                dC = rearrange(dC, "b 1 dstate l -> (b l) dstate").contiguous()
+            else:
+                dC = rearrange(dC, "b 1 dstate (l two) -> (b l) (dstate two)", two=2).contiguous()
+            dC_proj_bias = dC.sum(0) if not ctx.C_proj_bias_is_None else None
+            dx_dbl[:, -d_state:] = dC  # (bl d)
+            dC = None
+        ddelta = rearrange(ddelta, "b d l -> d (b l)")
+        ddelta_proj_weight = torch.einsum("dB,Br->dr", ddelta, x_dbl[:, :delta_rank])
+        dx_dbl[:, :delta_rank] = torch.einsum("dB,dr->Br", ddelta, delta_proj_weight)
+        dconv1d_out = rearrange(dconv1d_out, "b d l -> d (b l)")
+        dx_proj_weight = torch.einsum("Br,Bd->rd", dx_dbl, rearrange(conv1d_out, "b d l -> (b l) d"))
+        dconv1d_out = torch.addmm(dconv1d_out, x_proj_weight.t(), dx_dbl.t(), out=dconv1d_out)
+        dconv1d_out = rearrange(dconv1d_out, "d (b l) -> b d l", b=x.shape[0], l=x.shape[-1])
+        # The kernel supports passing in a pre-allocated dx (e.g., in case we want to fuse the
+        # backward of conv1d with the backward of chunk).
+        dx, dconv1d_weight, dconv1d_bias, *_ = causal_conv1d_cuda.causal_conv1d_bwd(
+            x, conv1d_weight, conv1d_bias, dconv1d_out, None, None, None, dx, False, True
+        )
+        dconv1d_bias = dconv1d_bias if conv1d_bias is not None else None
+        dconv1d_weight = rearrange(dconv1d_weight, "d w -> d 1 w")
+        return (
+            dxz,
+            dconv1d_weight,
+            dconv1d_bias,
+            dx_proj_weight,
+            ddelta_proj_weight,
+            dout_proj_weight,
+            dout_proj_bias,
+            dA,
+            dB,
+            dC,
+            dD,
+            ddelta_bias if delta_bias is not None else None,
+            # 6-None are delta_softplus, checkpoint_lvl, b_rms_weight, c_rms_weight, dt_rms_weight, b_c_dt_rms_eps
+            dB_proj_bias,
+            dC_proj_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+def mamba_inner_fn(
+    xz,
+    conv1d_weight,
+    conv1d_bias,
+    x_proj_weight,
+    delta_proj_weight,
+    out_proj_weight,
+    out_proj_bias,
+    A,
+    B=None,
+    C=None,
+    D=None,
+    delta_bias=None,
+    B_proj_bias=None,
+    C_proj_bias=None,
+    delta_softplus=True,
+    checkpoint_lvl=1,
+    b_rms_weight=None,
+    c_rms_weight=None,
+    dt_rms_weight=None,
+    b_c_dt_rms_eps=1e-6,
+):
+    return MambaInnerFn.apply(
+        xz,
+        conv1d_weight,
+        conv1d_bias,
+        x_proj_weight,
+        delta_proj_weight,
+        out_proj_weight,
+        out_proj_bias,
+        A,
+        B,
+        C,
+        D,
+        delta_bias,
+        B_proj_bias,
+        C_proj_bias,
+        delta_softplus,
+        checkpoint_lvl,
+        b_rms_weight,
+        c_rms_weight,
+        dt_rms_weight,
+        b_c_dt_rms_eps,
+    )
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..87ed89052873813153786bd416a981d3e5279af9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.cu
@@ -0,0 +1,383 @@
+#include "cuda_kernel.h"
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+__global__ void index_max_cuda_kernel(
+  float *index_vals,       // [batch_size, 32, num_block]
+  int   *indices,        // [batch_size, num_block]
+  float *max_vals,        // [batch_size, A_num_block * 32]
+  float *max_vals_scatter,   // [batch_size, 32, num_block]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+) {
+  // For one batch element (blockIdx.x), computes the maximum of index_vals over all blocks that
+  // map to the same A block (indices[...] / B_num_block), separately for each of the 32 rows.
+  // The maxima are written to max_vals and also scattered back to every contributing position
+  // in max_vals_scatter.
+  //
+  // Launch requirements visible from the code: a 1-D thread block of any size, one CUDA block
+  // per batch element, and A_num_block * 32 * sizeof(int) bytes of dynamic shared memory.
+  // Values are compared as integers after scaling by 1000 (atomicMax is used on int), so the
+  // results carry three decimal digits of the inputs.
+
+  long batch_idx = blockIdx.x;
+
+  long thread_idx = threadIdx.x;
+  long num_thread = blockDim.x;
+
+  long num_max = A_num_block * 32;   // entries in the shared running-maximum table
+  long num_val = 32 * num_block;     // entries of index_vals owned by this batch element
+
+  extern __shared__ float buffer[];
+  int *max_buffer = (int*)buffer;
+
+  // Seed every running maximum with a very small fixed-point value.
+  for (long idx = thread_idx; idx < num_max; idx += num_thread) {
+    max_buffer[idx] = -1e8;
+  }
+  __syncthreads();
+
+  int *indices_pt = &indices[batch_idx * num_block];
+  float *index_vals_pt = &index_vals[batch_idx * num_block * 32];
+
+  // The loop bound doubles as the tail guard: when num_val is not a multiple of the thread
+  // count, the surplus threads must not touch index_vals or max_buffer at all.
+  for (long idx = thread_idx; idx < num_val; idx += num_thread) {
+    int A_block_idx = indices_pt[idx % num_block] / B_num_block;
+    atomicMax(&max_buffer[A_block_idx * 32 + idx / num_block], (int)(index_vals_pt[idx] * 1000));
+  }
+  __syncthreads();
+
+  // Convert the fixed-point maxima back to float for the dense output.
+  float *max_vals_pt = &max_vals[batch_idx * A_num_block * 32];
+  for (long idx = thread_idx; idx < num_max; idx += num_thread) {
+    max_vals_pt[idx] = (float)max_buffer[idx] / 1000.;
+  }
+
+  // Scatter each maximum back to the positions that contributed to it (same tail guard as above).
+  float *max_vals_scatter_pt = &max_vals_scatter[batch_idx * num_block * 32];
+  for (long idx = thread_idx; idx < num_val; idx += num_thread) {
+    int A_block_idx = indices_pt[idx % num_block] / B_num_block;
+    max_vals_scatter_pt[idx] = (float)max_buffer[A_block_idx * 32 + idx / num_block] / 1000.;
+  }
+
+}
+
+__global__ void mm_to_sparse_cuda_kernel(
+  float *dense_A,   // [batch_size, A_num_block, dim, 32]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+) {
+  // For every selected (A block, B block) pair, writes the 32x32 tile
+  //   sparse_C[b][a] = sum over d of dense_B[d][b] * dense_A[d][a].
+  //
+  // Layout assumed by the index arithmetic below: blockDim = (64, 4). Each threadIdx.y slice
+  // owns one tile and 1024 floats of `buffer`; each of its 64 threads accumulates a 4x4 patch.
+  // `dim` is streamed in slabs of 8 rows through a two-stage shared-memory buffer while the
+  // per-thread operands are double-buffered in registers, so dim is expected to be a positive
+  // multiple of 8.
+
+  long batch_idx = blockIdx.y;
+  long block_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  long thread_idx = threadIdx.x;
+
+  __shared__ float buffer[4096];
+  float *A_buffer = &buffer[threadIdx.y * 1024]; // [2, 8, 32]
+  float *B_buffer = &buffer[threadIdx.y * 1024 + 512]; // [2, 8, 32]
+
+  long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+  // indices packs the pair as A_block * B_num_block + B_block.
+  long AB_block_idx = indices[batch_idx__block_idx];
+  float *dense_A_pt = &dense_A[(batch_idx * A_num_block + AB_block_idx / B_num_block) * dim * 32];
+  float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * dim * 32];
+
+  int reg_1_idx = thread_idx / 8;    // [0000000011111111222222223333333344444444555555556666666677777777]
+  int reg_2_idx = thread_idx % 8;    // [0123456701234567012345670123456701234567012345670123456701234567]
+
+  // Each operand array holds two groups of 4 values: one being consumed, one being prefetched.
+  float reg_1[8];
+  float reg_2[8];
+
+  float reg_array[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+  // Slab 0 goes into shared-memory stage 0.
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    A_buffer[i * 64 + thread_idx] = dense_A_pt[i * 64 + thread_idx];
+    B_buffer[i * 64 + thread_idx] = dense_B_pt[i * 64 + thread_idx];
+  }
+
+  __syncthreads();
+
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    reg_1[i] = A_buffer[reg_1_idx * 4 + i];
+    reg_2[i] = B_buffer[reg_2_idx * 4 + i];
+  }
+
+  for (int dim_stride = 1; dim_stride < (dim / 8); dim_stride++) {
+
+    // Prefetch slab `dim_stride` into stage dim_stride % 2 while the other stage is consumed.
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      A_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_A_pt[dim_stride * 256 + i * 64 + thread_idx];
+      B_buffer[(dim_stride % 2) * 256 + i * 64 + thread_idx] = dense_B_pt[dim_stride * 256 + i * 64 + thread_idx];
+    }
+
+    // Load row `mini_dim_idx` of the previous slab into one register group and accumulate the
+    // row loaded one step earlier from the other group.
+    #pragma unroll
+    for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) {
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_1_idx * 4 + i];
+        reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[((dim_stride - 1) % 2) * 256 + mini_dim_idx * 32 + reg_2_idx * 4 + i];
+      }
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        #pragma unroll
+        for (int j = 0; j < 4; j++) {
+          reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+        }
+      }
+    }
+
+    __syncthreads();
+
+    // Row 0 of the freshly loaded slab, then row 7 of the previous slab (still in group 1).
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      reg_1[i] = A_buffer[(dim_stride % 2) * 256 + reg_1_idx * 4 + i];
+      reg_2[i] = B_buffer[(dim_stride % 2) * 256 + reg_2_idx * 4 + i];
+    }
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+      }
+    }
+
+  }
+
+  // Shared-memory stage holding the final slab. Slab s is stored in stage s % 2, so the last
+  // slab (s = dim / 8 - 1) sits in stage 1 only when dim / 8 is even. Deriving the offset keeps
+  // dim = 8, 24, 40, ... from reading the other (stale or never written) stage.
+  // (dim / 8 + 1) % 2 equals (dim / 8 - 1) % 2 for dim >= 8 and cannot become negative.
+  const int last_stage = (int)(((dim / 8) + 1) % 2) * 256;
+
+  // Drain: rows 1..7 of the final slab (row 0 is already in register group 0).
+  #pragma unroll
+  for (int mini_dim_idx = 1; mini_dim_idx < 8; mini_dim_idx++) {
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      reg_1[(mini_dim_idx % 2) * 4 + i] = A_buffer[last_stage + mini_dim_idx * 32 + reg_1_idx * 4 + i];
+      reg_2[(mini_dim_idx % 2) * 4 + i] = B_buffer[last_stage + mini_dim_idx * 32 + reg_2_idx * 4 + i];
+    }
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+      }
+    }
+  }
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];
+    }
+  }
+  // C_buffer aliases the A/B staging area, so every thread must be done reading it first.
+  __syncthreads();
+
+  float *C_buffer = &buffer[threadIdx.y * 1024]; // [32, 32]
+
+  #pragma unroll
+  for (int i = 0; i < 4; i++) {
+    #pragma unroll
+    for (int j = 0; j < 4; j++) {
+      C_buffer[(reg_2_idx * 4 + j) * 32 + reg_1_idx * 4 + i] = reg_array[i * 4 + j];
+    }
+  }
+  __syncthreads();
+
+  float *sparse_C_pt = &sparse_C[batch_idx__block_idx * 1024];
+
+  // Copy the assembled tile out with consecutive threads writing consecutive addresses.
+  #pragma unroll
+  for (int i = 0; i < 16; i++) {
+    sparse_C_pt[i * 64 + thread_idx] = C_buffer[i * 64 + thread_idx];
+  }
+
+}
+
+__global__ void sparse_dense_mm_cuda_kernel(  // dense_C[A blk][d][c] += sum_m dense_B[B blk][d][m] * sparse_A tile[m][c], one 32x32 tile per threadIdx.y slice
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  float *dense_C,   // [batch_size, A_num_block, dim, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+) {
+  // The index arithmetic below assumes blockDim = (128, 2): 8 x 128 = 1024 tile floats, 16 x 128 = 2048 slab floats, two 3072-float slices of `buffer`.
+  long batch_idx = blockIdx.y;
+  long block_idx = blockIdx.x * blockDim.y + threadIdx.y;  // tile handled by this threadIdx.y slice
+
+  long thread_idx = threadIdx.x;
+
+  __shared__ float buffer[6144];
+  float *A_buffer = &buffer[threadIdx.y * 3072]; // [32, 32]
+  float *B_buffer = &buffer[threadIdx.y * 3072 + 1024]; // 2048 floats; indexed below as [64 dim rows, 32]
+
+  long batch_idx__block_idx = batch_idx * num_block + block_idx;
+
+  float *sparse_A_pt = &sparse_A[batch_idx__block_idx * 1024];
+  #pragma unroll
+  for (int i = 0; i < 8; i++) {
+    A_buffer[i * 128 + thread_idx] = sparse_A_pt[i * 128 + thread_idx];  // stage the whole tile once; first read happens after the barrier inside the loop
+  }
+
+  long AB_block_idx = indices[batch_idx__block_idx];  // packed as A_block * B_num_block + B_block
+  float *dense_B_pt = &dense_B[(batch_idx * B_num_block + AB_block_idx % B_num_block) * 32 * dim];
+  float *dense_C_pt = &dense_C[(batch_idx * A_num_block + AB_block_idx / B_num_block) * 32 * dim];
+
+  // [0000000011111111222222223333333344444444555555556666666677777777]
+  // [0123456701234567012345670123456701234567012345670123456701234567]
+  int reg_1_idx = thread_idx / 8;  // selects 4 consecutive dim rows of the slab
+  int reg_2_idx = thread_idx % 8;  // selects 4 consecutive tile columns
+
+  float reg_1[8];  // two groups of 4: one consumed, one prefetched
+  float reg_2[8];
+
+  float reg_array[16];  // this thread's 4x4 output patch for the current slab
+
+  for (int dim_stride = 0; dim_stride < dim; dim_stride = dim_stride + 64) {  // every pass handles a full 64-row slab, so dim is expected to be a multiple of 64
+
+    #pragma unroll
+    for (int i = 0; i < 16; i++) {
+      B_buffer[i * 128 + thread_idx] = dense_B_pt[dim_stride * 32 + i * 128 + thread_idx];
+    }
+
+    #pragma unroll
+    for (int i = 0; i < 16; i++) {
+      reg_array[i] = 0;
+    }
+
+    __syncthreads();  // A_buffer and B_buffer are fully written before anyone reads them
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      reg_1[i] = B_buffer[(reg_1_idx * 4 + i) * 32];
+      reg_2[i] = A_buffer[reg_2_idx * 4 + i];
+    }
+
+    #pragma unroll
+    for (int mini_dim_idx = 1; mini_dim_idx < 32; mini_dim_idx++) {  // reduction index m; loads step m while accumulating step m - 1
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        reg_1[(mini_dim_idx % 2) * 4 + i] = B_buffer[(reg_1_idx * 4 + i) * 32 + mini_dim_idx];
+        reg_2[(mini_dim_idx % 2) * 4 + i] = A_buffer[mini_dim_idx * 32 + reg_2_idx * 4 + i];
+      }
+      #pragma unroll
+      for (int i = 0; i < 4; i++) {
+        #pragma unroll
+        for (int j = 0; j < 4; j++) {
+          reg_array[i * 4 + j] += reg_1[((mini_dim_idx - 1) % 2) * 4 + i] * reg_2[((mini_dim_idx - 1) % 2) * 4 + j];
+        }
+      }
+    }
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        reg_array[i * 4 + j] += reg_1[4 + i] * reg_2[4 + j];  // last step (m = 31) is still pending in group 1
+      }
+    }
+
+    __syncthreads();  // C_buffer aliases B_buffer: wait until all reads of the slab are done
+
+    float *C_buffer = &buffer[threadIdx.y * 3072 + 1024]; // [64, 32]
+
+    #pragma unroll
+    for (int i = 0; i < 4; i++) {
+      #pragma unroll
+      for (int j = 0; j < 4; j++) {
+        C_buffer[(reg_1_idx * 4 + i) * 32 + reg_2_idx * 4 + j] = reg_array[i * 4 + j];
+      }
+    }
+    __syncthreads();
+
+    #pragma unroll
+    for (int i = 0; i < 16; i++) {
+      atomicAdd(&dense_C_pt[dim_stride * 32 + i * 128 + thread_idx], C_buffer[i * 128 + thread_idx]);  // atomic because several tiles can share the same A block
+    }
+    __syncthreads();  // the next pass overwrites the aliased buffer
+
+  }
+
+}
+
+
+__global__ void reduce_sum_cuda_kernel(
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_C,   // [batch_size, A_num_block, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+) {
+  // Sums each 32x32 tile over its 32 rows and atomically adds the resulting 32 column sums to
+  // the dense_C row of the tile's A block (indices[...] / B_num_block).
+  // threadIdx.x picks the column; every threadIdx.y slice handles its own tile.
+  const long batch = blockIdx.y;
+  const long tile = blockIdx.x * blockDim.y + threadIdx.y;
+  const long col = threadIdx.x;
+
+  const long flat_tile = batch * num_block + tile;
+  const long packed_idx = indices[flat_tile];
+  float *tile_in = sparse_A + flat_tile * 1024;
+
+  // 16 staging slots used as two halves of 8 rows: one half is refilled while the half loaded
+  // on the previous step is accumulated. Rows are added in order 0..31.
+  float staged[16];
+  float col_sum = 0;
+
+  #pragma unroll
+  for (int row = 0; row < 8; row++) {
+    staged[row] = tile_in[row * 32 + col];
+  }
+  #pragma unroll
+  for (int base = 8; base < 32; base = base + 8) {
+    #pragma unroll
+    for (int row = 0; row < 8; row++) {
+      staged[(base + row) % 16] = tile_in[(base + row) * 32 + col];
+    }
+    #pragma unroll
+    for (int row = 0; row < 8; row++) {
+      col_sum = col_sum + staged[(base - 8 + row) % 16];
+    }
+  }
+  // Rows 24..31 were loaded into the upper half on the final step and are still pending.
+  #pragma unroll
+  for (int row = 0; row < 8; row++) {
+    col_sum = col_sum + staged[8 + row];
+  }
+
+  float *out_row = dense_C + (batch * A_num_block + packed_idx / B_num_block) * 32;
+
+  // Several tiles may target the same A block, hence the atomic accumulation.
+  atomicAdd(out_row + col, col_sum);
+
+}
+
+__global__ void scatter_cuda_kernel(
+  float *dense_A,   // [batch_size, A_num_block, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+) {
+  // Fills every 32x32 tile of sparse_C by repeating, in all 32 rows, the 32-value dense_A row
+  // of the tile's A block (indices[...] / B_num_block).
+  // threadIdx.x picks the column; every threadIdx.y slice handles its own tile.
+  const long batch = blockIdx.y;
+  const long tile = blockIdx.x * blockDim.y + threadIdx.y;
+  const long col = threadIdx.x;
+
+  const long packed_idx = indices[batch * num_block + tile];
+  float *src_row = dense_A + (batch * A_num_block + packed_idx / B_num_block) * 32;
+  float *tile_out = sparse_C + (batch * num_block + tile) * 1024;
+
+  // One global read per thread, then 32 stores down the column.
+  const float fill = src_row[col];
+
+  #pragma unroll
+  for (int row = 0; row < 32; row++) {
+    tile_out[row * 32 + col] = fill;
+  }
+
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a95b46f7d159b11851143710034cf80c20aa6bf8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_kernel.h
@@ -0,0 +1,59 @@
+
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
+#define OPTIMAL_THREADS 256
+
+__global__ void index_max_cuda_kernel(  // prototype only (body in cuda_kernel.cu): per-row maxima of index_vals grouped by A block, plus the same maxima scattered back per block
+  float *index_vals,       // [batch_size, 32, num_block]
+  int   *indices,        // [batch_size, num_block]
+  float *max_vals,        // [batch_size, A_num_block * 32]
+  float *max_vals_scatter,   // [batch_size, 32, num_block]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+);
+
+__global__ void mm_to_sparse_cuda_kernel(  // prototype only (body in cuda_kernel.cu): one 32x32 product tile per entry of indices, reduced over dim
+  float *dense_A,   // [batch_size, A_num_block, dim, 32]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+);
+
+__global__ void sparse_dense_mm_cuda_kernel(  // prototype only (body in cuda_kernel.cu): multiplies each 32x32 tile with its dense_B block and atomically accumulates into dense_C
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_B,   // [batch_size, B_num_block, dim, 32]
+  float *dense_C,   // [batch_size, A_num_block, dim, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long dim,
+  long num_block
+);
+
+__global__ void reduce_sum_cuda_kernel(  // prototype only (body in cuda_kernel.cu): column sums of each 32x32 tile, atomically accumulated per A block
+  float *sparse_A,  // [batch_size, num_block, 32, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *dense_C,   // [batch_size, A_num_block, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+);
+
+__global__ void scatter_cuda_kernel(  // prototype only (body in cuda_kernel.cu): repeats the dense_A row of each tile's A block across all 32 rows of the tile
+  float *dense_A,   // [batch_size, A_num_block, 32]
+  int   *indices,   // [batch_size, num_block]
+  float *sparse_C,  // [batch_size, num_block, 32, 32]
+  long batch_size,
+  long A_num_block,
+  long B_num_block,
+  long num_block
+);
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ba2a0cacfe614e75e06d2dde80dc77a6e8a4ec1a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.cu
@@ -0,0 +1,154 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "cuda_launch.h"
+#include "cuda_kernel.h"
+#include <vector>
+
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+std::vector<at::Tensor> index_max_kernel(
+  at::Tensor index_vals,  // [batch_size, 32, num_block]
+  at::Tensor indices,     // [batch_size, num_block],
+  int A_num_block,
+  int B_num_block
+) {
+  // Host launcher for index_max_cuda_kernel: one CUDA block of 256 threads per batch element,
+  // with A_num_block * 32 floats of dynamic shared memory for the running maxima.
+  // Returns {max_vals [batch_size, A_num_block * 32], max_vals_scatter [batch_size, 32, num_block]}.
+  // The raw data pointers are handed to the kernel as-is, so the tensors are assumed to be
+  // contiguous float32 / int32 CUDA tensors; verify against the caller.
+  int batch_size = indices.size(0);
+  int num_block = indices.size(1);
+
+  at::Tensor max_vals = at::zeros({batch_size, A_num_block * 32}, index_vals.options());
+  at::Tensor max_vals_scatter = at::zeros({batch_size, 32, num_block}, index_vals.options());
+
+  dim3 threads(256);
+  dim3 blocks(batch_size);
+  int shared_mem = A_num_block * 32 * sizeof(float);
+
+  // An empty grid is an invalid launch configuration; the zero-filled outputs are already the
+  // right answer for an empty batch, so skip the launch entirely.
+  if (blocks.x > 0) {
+    index_max_cuda_kernel<<<blocks, threads, shared_mem>>>(
+      index_vals.data_ptr<float>(),
+      indices.data_ptr<int>(),
+      max_vals.data_ptr<float>(),
+      max_vals_scatter.data_ptr<float>(),
+      batch_size,
+      A_num_block,
+      B_num_block,
+      num_block
+    );
+
+    // Launch failures (for example a shared-memory request the device cannot satisfy) are only
+    // reported through the runtime's last-error slot; surface them instead of returning zeros.
+    cudaError_t launch_status = cudaGetLastError();
+    TORCH_CHECK(launch_status == cudaSuccess,
+                "index_max_cuda_kernel launch failed: ", cudaGetErrorString(launch_status));
+  }
+
+  return {max_vals, max_vals_scatter};
+}
+
+at::Tensor mm_to_sparse_kernel(
+  at::Tensor dense_A,  // [batch_size, A_num_block, dim, 32]
+  at::Tensor dense_B,  // [batch_size, B_num_block, dim, 32]
+  at::Tensor indices   // [batch_size, num_block]
+) {
+  // Host launcher for mm_to_sparse_cuda_kernel. Returns sparse_C [batch_size, num_block, 32, 32].
+  // Each CUDA block of 64 x 4 threads produces 4 tiles, and the grid has num_block / 4 blocks
+  // per batch element (integer division), so any trailing num_block % 4 tiles are not computed
+  // and stay at their zero initial value.
+  // The raw data pointers are handed to the kernel as-is, so the tensors are assumed to be
+  // contiguous float32 / int32 CUDA tensors; verify against the caller.
+  int batch_size = dense_A.size(0);
+  int A_num_block = dense_A.size(1);
+  int B_num_block = dense_B.size(1);
+  int dim = dense_A.size(2);
+  int num_block = indices.size(1);
+
+  at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options());
+
+  dim3 threads(64, 4);
+  dim3 blocks(num_block / 4, batch_size);
+
+  // An empty grid is an invalid launch configuration and there is nothing to compute anyway.
+  if (blocks.x > 0 && blocks.y > 0) {
+    mm_to_sparse_cuda_kernel<<<blocks, threads>>>(
+      dense_A.data_ptr<float>(),
+      dense_B.data_ptr<float>(),
+      indices.data_ptr<int>(),
+      sparse_C.data_ptr<float>(),
+      batch_size,
+      A_num_block,
+      B_num_block,
+      dim,
+      num_block
+    );
+
+    // Launch failures are only reported through the runtime's last-error slot; surface them
+    // instead of silently returning the zero-initialised output.
+    cudaError_t launch_status = cudaGetLastError();
+    TORCH_CHECK(launch_status == cudaSuccess,
+                "mm_to_sparse_cuda_kernel launch failed: ", cudaGetErrorString(launch_status));
+  }
+
+  return sparse_C;
+}
+
+at::Tensor sparse_dense_mm_kernel(
+  at::Tensor sparse_A,  // [batch_size, num_block, 32, 32]
+  at::Tensor indices,   // [batch_size, num_block]
+  at::Tensor dense_B,   // [batch_size, B_num_block, dim, 32]
+  int A_num_block
+) {
+  // Host launcher for sparse_dense_mm_cuda_kernel. Returns dense_C [batch_size, A_num_block, dim, 32],
+  // zero-initialised here and accumulated into by the kernel.
+  // Each CUDA block of 128 x 2 threads consumes 2 tiles, and the grid has num_block / 2 blocks
+  // per batch element (integer division), so a trailing odd tile is not processed.
+  // The raw data pointers are handed to the kernel as-is, so the tensors are assumed to be
+  // contiguous float32 / int32 CUDA tensors; verify against the caller.
+  int batch_size = sparse_A.size(0);
+  int num_block = sparse_A.size(1);
+  int B_num_block = dense_B.size(1);
+  int dim = dense_B.size(2);
+
+  at::Tensor dense_C = at::zeros({batch_size, A_num_block, dim, 32}, dense_B.options());
+
+  dim3 threads(128, 2);
+  dim3 blocks(num_block / 2, batch_size);
+
+  // An empty grid is an invalid launch configuration and there is nothing to accumulate anyway.
+  if (blocks.x > 0 && blocks.y > 0) {
+    sparse_dense_mm_cuda_kernel<<<blocks, threads>>>(
+      sparse_A.data_ptr<float>(),
+      indices.data_ptr<int>(),
+      dense_B.data_ptr<float>(),
+      dense_C.data_ptr<float>(),
+      batch_size,
+      A_num_block,
+      B_num_block,
+      dim,
+      num_block
+    );
+
+    // Launch failures are only reported through the runtime's last-error slot; surface them
+    // instead of silently returning the zero-initialised output.
+    cudaError_t launch_status = cudaGetLastError();
+    TORCH_CHECK(launch_status == cudaSuccess,
+                "sparse_dense_mm_cuda_kernel launch failed: ", cudaGetErrorString(launch_status));
+  }
+
+  return dense_C;
+}
+
+at::Tensor reduce_sum_kernel(
+  at::Tensor sparse_A,  // [batch_size, num_block, 32, 32]
+  at::Tensor indices,   // [batch_size, num_block]
+  int A_num_block,
+  int B_num_block
+) {
+  // Host launcher for reduce_sum_cuda_kernel. Returns dense_C [batch_size, A_num_block, 32],
+  // zero-initialised here and accumulated into by the kernel.
+  // Each CUDA block of 32 x 4 threads consumes 4 tiles, and the grid has num_block / 4 blocks
+  // per batch element (integer division), so any trailing num_block % 4 tiles are not summed.
+  // The raw data pointers are handed to the kernel as-is, so the tensors are assumed to be
+  // contiguous float32 / int32 CUDA tensors; verify against the caller.
+  int batch_size = sparse_A.size(0);
+  int num_block = sparse_A.size(1);
+
+  at::Tensor dense_C = at::zeros({batch_size, A_num_block, 32}, sparse_A.options());
+
+  dim3 threads(32, 4);
+  dim3 blocks(num_block / 4, batch_size);
+
+  // An empty grid is an invalid launch configuration and there is nothing to accumulate anyway.
+  if (blocks.x > 0 && blocks.y > 0) {
+    reduce_sum_cuda_kernel<<<blocks, threads>>>(
+      sparse_A.data_ptr<float>(),
+      indices.data_ptr<int>(),
+      dense_C.data_ptr<float>(),
+      batch_size,
+      A_num_block,
+      B_num_block,
+      num_block
+    );
+
+    // Launch failures are only reported through the runtime's last-error slot; surface them
+    // instead of silently returning the zero-initialised output.
+    cudaError_t launch_status = cudaGetLastError();
+    TORCH_CHECK(launch_status == cudaSuccess,
+                "reduce_sum_cuda_kernel launch failed: ", cudaGetErrorString(launch_status));
+  }
+
+  return dense_C;
+}
+
+at::Tensor scatter_kernel(
+  at::Tensor dense_A,   // [batch_size, A_num_block, 32]
+  at::Tensor indices,   // [batch_size, num_block]
+  int B_num_block
+) {
+  // Host launcher for scatter_cuda_kernel. Returns sparse_C [batch_size, num_block, 32, 32].
+  // Each CUDA block of 32 x 4 threads fills 4 tiles, and the grid has num_block / 4 blocks per
+  // batch element (integer division), so any trailing num_block % 4 tiles stay at zero.
+  // The raw data pointers are handed to the kernel as-is, so the tensors are assumed to be
+  // contiguous float32 / int32 CUDA tensors; verify against the caller.
+  int batch_size = dense_A.size(0);
+  int A_num_block = dense_A.size(1);
+  int num_block = indices.size(1);
+
+  at::Tensor sparse_C = at::zeros({batch_size, num_block, 32, 32}, dense_A.options());
+
+  dim3 threads(32, 4);
+  dim3 blocks(num_block / 4, batch_size);
+
+  // An empty grid is an invalid launch configuration and there is nothing to fill anyway.
+  if (blocks.x > 0 && blocks.y > 0) {
+    scatter_cuda_kernel<<<blocks, threads>>>(
+      dense_A.data_ptr<float>(),
+      indices.data_ptr<int>(),
+      sparse_C.data_ptr<float>(),
+      batch_size,
+      A_num_block,
+      B_num_block,
+      num_block
+    );
+
+    // Launch failures are only reported through the runtime's last-error slot; surface them
+    // instead of silently returning the zero-initialised output.
+    cudaError_t launch_status = cudaGetLastError();
+    TORCH_CHECK(launch_status == cudaSuccess,
+                "scatter_cuda_kernel launch failed: ", cudaGetErrorString(launch_status));
+  }
+
+  return sparse_C;
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0200140ee337b8c5d9583767bbad1e842e9d4677
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/cuda_launch.h
@@ -0,0 +1,39 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <vector>
+
+#define min(a, b) ((a)<(b)?(a):(b))
+#define max(a, b) ((a)>(b)?(a):(b))
+
+std::vector<at::Tensor> index_max_kernel(  // launcher defined in cuda_launch.cu; returns {max_vals, max_vals_scatter}
+  at::Tensor index_vals,  // [batch_size, 32, num_block]
+  at::Tensor indices,  // [batch_size, num_block]
+  int A_num_block,
+  int B_num_block
+);
+
+at::Tensor mm_to_sparse_kernel(  // launcher defined in cuda_launch.cu; returns sparse_C [batch_size, num_block, 32, 32]
+  at::Tensor dense_A,  // [batch_size, A_num_block, dim, 32]
+  at::Tensor dense_B,  // [batch_size, B_num_block, dim, 32]
+  at::Tensor indices  // [batch_size, num_block]
+);
+
+at::Tensor sparse_dense_mm_kernel(  // launcher defined in cuda_launch.cu; returns dense_C [batch_size, A_num_block, dim, 32]
+  at::Tensor sparse_A,  // [batch_size, num_block, 32, 32]
+  at::Tensor indices,  // [batch_size, num_block]
+  at::Tensor dense_B,  // [batch_size, B_num_block, dim, 32]
+  int A_num_block
+);
+
+at::Tensor reduce_sum_kernel(  // launcher defined in cuda_launch.cu; returns dense_C [batch_size, A_num_block, 32]
+  at::Tensor sparse_A,  // [batch_size, num_block, 32, 32]
+  at::Tensor indices,  // [batch_size, num_block]
+  int A_num_block,
+  int B_num_block
+);
+
+at::Tensor scatter_kernel(  // launcher defined in cuda_launch.cu; returns sparse_C [batch_size, num_block, 32, 32]
+  at::Tensor dense_A,  // [batch_size, A_num_block, 32]
+  at::Tensor indices,  // [batch_size, num_block]
+  int B_num_block
+);
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/torch_extension.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/torch_extension.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60c9262b779270a6e95ae54f53a67daa6d740a9e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/mra/torch_extension.cpp
@@ -0,0 +1,78 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "cuda_launch.h"
+#include <vector>
+
+std::vector<at::Tensor> index_max(at::Tensor index_vals, at::Tensor indices, int A_num_block, int B_num_block) {
+  // Bound to Python as "index_max"; hands all arguments unchanged to the CUDA launcher.
+  return index_max_kernel(index_vals, indices, A_num_block, B_num_block);
+}
+
+at::Tensor mm_to_sparse(at::Tensor dense_A, at::Tensor dense_B, at::Tensor indices) {
+  // Bound to Python as "mm_to_sparse"; hands all arguments unchanged to the CUDA launcher.
+  return mm_to_sparse_kernel(dense_A, dense_B, indices);
+}
+
+at::Tensor sparse_dense_mm(at::Tensor sparse_A, at::Tensor indices, at::Tensor dense_B, int A_num_block) {
+  // Bound to Python as "sparse_dense_mm"; hands all arguments unchanged to the CUDA launcher.
+  return sparse_dense_mm_kernel(sparse_A, indices, dense_B, A_num_block);
+}
+
+at::Tensor reduce_sum(at::Tensor sparse_A, at::Tensor indices, int A_num_block, int B_num_block) {
+  // Bound to Python as "reduce_sum"; hands all arguments unchanged to the CUDA launcher.
+  return reduce_sum_kernel(sparse_A, indices, A_num_block, B_num_block);
+}
+
+at::Tensor scatter(at::Tensor dense_A, at::Tensor indices, int B_num_block) {
+  // Bound to Python as "scatter"; hands all arguments unchanged to the CUDA launcher.
+  return scatter_kernel(dense_A, indices, B_num_block);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // registers the five wrapper functions of this file on module `m`
+  m.def("index_max", &index_max, "index_max (CUDA)");  // arguments: Python name, function pointer, docstring
+  m.def("mm_to_sparse", &mm_to_sparse, "mm_to_sparse (CUDA)");
+  m.def("sparse_dense_mm", &sparse_dense_mm, "sparse_dense_mm (CUDA)");
+  m.def("reduce_sum", &reduce_sum, "reduce_sum (CUDA)");
+  m.def("scatter", &scatter, "scatter (CUDA)");
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..571d5a8a8307e95aac689eb3c9333d1ad350c7de
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda.cu
@@ -0,0 +1,187 @@
+#include <stdio.h>
+#include <assert.h>
+
+#define MIN_VALUE (-1e38)
+
+template <typename F>  // F: element type shared by _w, _u, _k, _v and _y
+__global__ void kernel_forward(  // one thread per (batch, channel) pair; the T time steps are walked sequentially
+    const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u,
+    const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y
+) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat (batch, channel) index; no idx < B * C guard here
+    const int _b = idx / C;  // batch index
+    const int _c = idx % C;  // channel index
+    const int _offset = _b * T * C + _c;  // element (_b, t = 0, _c) when _k/_v/_y are indexed as [B, T, C]
+
+    F u = _u[_c];  // per-channel term added to k for the current step only
+    F w = _w[_c];  // per-channel term added to the running exponent pp at every step
+    const F *__restrict__ const k = _k + _offset;
+    const F *__restrict__ const v = _v + _offset;
+    F *__restrict__ const y = _y + _offset;
+
+    // aa and bb are running sums divided by exp(pp) (to avoid overflow)
+    F aa = 0, bb = 0, pp = MIN_VALUE;
+    for (int i = 0; i < T; i++) {
+        const int ii = i * C;  // consecutive time steps are C elements apart
+        const F kk = k[ii];
+        const F vv = v[ii];
+
+        F ww = u + kk;
+        F p = max(pp, ww);  // common offset, so both exp() arguments below are <= 0
+        F e1 = exp(pp - p);
+        F e2 = exp(ww - p);
+        y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2);  // output: ratio of the two running sums with the current step mixed in
+        
+        ww = w + pp;  // from here on: fold the current step into the running state
+        p = max(ww, kk);
+        e1 = exp(ww - p);
+        e2 = exp(kk - p);
+        aa = e1 * aa + e2 * vv;
+        bb = e1 * bb + e2;
+        pp = p;  // the state is now expressed relative to exp(p)
+    }
+}
+
+template <typename F>  // F: element type shared by _w, _u, _k, _v, _y and _s
+__global__ void kernel_forward_with_state(  // same recurrence as kernel_forward, but the (aa, bb, pp) state is read from and written back to _s
+    const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u,
+    const F *__restrict__ const _k, const F *__restrict__ const _v, F *__restrict__ const _y, F *__restrict__ const _s
+) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat (batch, channel) index; no idx < B * C guard here
+    const int _b = idx / C;  // batch index
+    const int _c = idx % C;  // channel index
+    const int _offset_s = _b * C * 3 + _c * 3;  // three state values per (batch, channel), i.e. _s is indexed as [B, C, 3]
+    const int _offset = _b * T * C + _c;  // element (_b, t = 0, _c) when _k/_v/_y are indexed as [B, T, C]
+
+    F u = _u[_c];  // per-channel term added to k for the current step only
+    F w = _w[_c];  // per-channel term added to the running exponent pp at every step
+    const F *__restrict__ const k = _k + _offset;
+    const F *__restrict__ const v = _v + _offset;
+    F *__restrict__ const y = _y + _offset;
+    F *__restrict__ const s = _s + _offset_s;
+
+    // aa and bb are running sums divided by exp(pp) (to avoid overflow)
+    F aa = s[0], bb = s[1], pp = s[2];  // resume from the caller-provided state
+    for (int i = 0; i < T; i++) {
+        const int ii = i * C;  // consecutive time steps are C elements apart
+        const F kk = k[ii];
+        const F vv = v[ii];
+
+        F ww = u + kk;
+        F p = max(pp, ww);  // common offset, so both exp() arguments below are <= 0
+        F e1 = exp(pp - p);
+        F e2 = exp(ww - p);
+        y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2);  // output: ratio of the two running sums with the current step mixed in
+        
+        ww = w + pp;  // from here on: fold the current step into the running state
+        p = max(ww, kk);
+        e1 = exp(ww - p);
+        e2 = exp(kk - p);
+        aa = e1 * aa + e2 * vv;
+        bb = e1 * bb + e2;
+        pp = p;  // the state is now expressed relative to exp(p)
+    }
+    s[0] = aa;  // write the final state back in place
+    s[1] = bb;
+    s[2] = pp;
+}
+
+template <typename F>  // F: element type shared by all pointer arguments
+__global__ void kernel_backward(  // one thread per (batch, channel) pair: a forward sweep over T for gw/gu, then a reverse sweep for gk/gv
+    const int B, const int T, const int C, const F *__restrict__ const _w, const F *__restrict__ const _u,
+    const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _y,
+    const F *__restrict__ const _gy, F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk,
+    F *__restrict__ const _gv
+) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat (batch, channel) index; no idx < B * C guard here
+    const int _b = idx / C;  // batch index
+    const int _c = idx % C;  // channel index
+    const int _offset = _b * T * C + _c;  // element (_b, t = 0, _c) when the [B, T, C]-indexed buffers are addressed
+
+    F u = _u[_c];
+    F w = _w[_c];
+    const F *__restrict__ const k = _k + _offset;
+    const F *__restrict__ const v = _v + _offset;
+    const F *__restrict__ const y = _y + _offset;
+    const F *__restrict__ const gy = _gy + _offset;
+    F *__restrict__ const gk = _gk + _offset;
+    F *__restrict__ const gv = _gv + _offset;
+
+    F q[Tmax], r[Tmax];  // per-step values saved by the first sweep for the second; Tmax is not defined in this file (presumably a compile-time macro, confirm) and T must not exceed it
+
+    F gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;
+    for (int i = 0; i < T; i++) {  // first sweep: forward in time
+        const int ii = i * C;
+        const F kk = k[ii];
+        const F vv = v[ii];
+        const F yy = y[ii];
+
+        F ww = u + kk;
+        F p = max(pp, ww);  // common offset, so both exp() arguments below are <= 0
+        F e1 = exp(pp - p);
+        F e2 = exp(ww - p);
+        const F qq = gy[ii] / (e1 * bb + e2);  // incoming gradient divided by the forward denominator
+        gw += (ga - gb * yy) * e1 * qq;
+        gu += (vv - yy) * e2 * qq;
+        q[i] = qq;
+        r[i] = ww - p;
+
+        ww = w + pp;  // same state update as the forward kernel, plus the ga/gb accumulators
+        p = max(ww, kk);
+        e1 = exp(ww - p);
+        e2 = exp(kk - p);
+        ga = e1 * (aa + ga);
+        gb = e1 * (bb + gb);
+        aa = e1 * aa + e2 * vv;
+        bb = e1 * bb + e2;
+        pp = p;
+    }
+    const int _offsetBC = _b * C + _c;  // _gw/_gu hold one value per (batch, channel)
+    _gw[_offsetBC] = gw * _w[_c]; // multiply by w because of w -> -exp(w) in python forward()
+    _gu[_offsetBC] = gu;
+
+    aa = 0, bb = 0, pp = MIN_VALUE;  // accumulators are reused with a fresh start for the reverse sweep
+    for (int i = T - 1; i >= 0; i--) {  // second sweep: backward in time
+        const int ii = i * C;
+        const F kk = k[ii];
+        const F vv = v[ii];
+        const F yy = y[ii];
+        const F qq = q[i];
+        const F rr = r[i];
+
+        F e1 = qq * exp(rr);
+        F e2 = exp(kk + pp);
+        gk[ii] = e1 * (vv - yy) + e2 * (aa * vv + bb);
+        gv[ii] = e1 + e2 * aa;
+
+        const F ww = w + pp;
+        const F www = rr - u - kk;
+        const F p = max(ww, www);  // common offset for the two exp() calls below
+        e1 = exp(ww - p);
+        e2 = qq * exp(www - p);
+        aa = e1 * aa + e2;
+        bb = e1 * bb - e2 * yy;
+        pp = p;
+    }
+}
+
+void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) {
+    // Launches kernel_forward with one thread per (batch, channel) pair, at most 32 per block.
+    // requires --maxrregcount 60 for optimal performance
+    const dim3 block(min(C, 32));
+    // The kernel has no tail guard, so the B * C threads must fill the grid exactly.
+    assert(B * C % block.x == 0);
+    const dim3 grid(B * C / block.x);
+    kernel_forward<<<grid, block>>>(B, T, C, w, u, k, v, y);
+}
+
+void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s) {
+    // Launches kernel_forward_with_state with one thread per (batch, channel) pair, at most 32 per block.
+    // requires --maxrregcount 60 for optimal performance
+    const dim3 block(min(C, 32));
+    // The kernel has no tail guard, so the B * C threads must fill the grid exactly.
+    assert(B * C % block.x == 0);
+    const dim3 grid(B * C / block.x);
+    kernel_forward_with_state<<<grid, block>>>(B, T, C, w, u, k, v, y, s);
+}
+
+void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv) {
+    // Launches kernel_backward with one thread per (batch, channel) pair, at most 32 per block.
+    // requires --maxrregcount 60 for optimal performance
+    const dim3 block(min(C, 32));
+    // The kernel has no tail guard, so the B * C threads must fill the grid exactly.
+    assert(B * C % block.x == 0);
+    const dim3 grid(B * C / block.x);
+    kernel_backward<<<grid, block>>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda_bf16.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda_bf16.cu
new file mode 100644
index 0000000000000000000000000000000000000000..042cb4aba1db98be5916aea1de86a7fed0b6510d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_cuda_bf16.cu
@@ -0,0 +1,186 @@
+#include <stdio.h>
+#include <assert.h>
+#include "ATen/ATen.h"
+#define MIN_VALUE (-1e38)
+typedef at::BFloat16 bf16;
+
+// WKV forward pass for bf16 tensors; all arithmetic is carried out in float.
+// Each thread owns one (batch, channel) pair and walks the T time steps in order,
+// reading k/v and writing y at stride C.
+__global__ void kernel_forward_bf16(
+    const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u,
+    const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y
+) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    const int batch = tid / C;
+    const int chan = tid % C;
+    const int base = batch * T * C + chan;
+
+    const float bonus = float(_u[chan]);
+    const float decay = _w[chan];
+    const bf16 *__restrict__ const keys = _k + base;
+    const bf16 *__restrict__ const vals = _v + base;
+    bf16 *__restrict__ const out = _y + base;
+
+    // num and den are the running sums divided by exp(shift) (to avoid overflow)
+    float num = 0, den = 0, shift = MIN_VALUE;
+    for (int t = 0; t < T; t++) {
+        const int pos = t * C;
+        const float kk = float(keys[pos]);
+        const float vv = float(vals[pos]);
+
+        // Output for this step: blend the accumulated state with the current token,
+        // both rescaled by the larger of the two exponents.
+        const float cur = bonus + kk;
+        const float out_shift = max(shift, cur);
+        const float w_past = exp(shift - out_shift);
+        const float w_cur = exp(cur - out_shift);
+        out[pos] = bf16((w_past * num + w_cur * vv) / (w_past * den + w_cur));
+
+        // State update: decay the running sums, add the current token, re-normalise.
+        const float decayed = decay + shift;
+        const float next_shift = max(decayed, kk);
+        const float w_keep = exp(decayed - next_shift);
+        const float w_add = exp(kk - next_shift);
+        num = w_keep * num + w_add * vv;
+        den = w_keep * den + w_add;
+        shift = next_shift;
+    }
+}
+
+// WKV forward pass for bf16 tensors that resumes from, and writes back, a float state.
+// Each thread owns one (batch, channel) pair and walks the T time steps in order.
+// The state for that pair lives at _s[(_b * C + _c) * 3 .. +2] and holds (aa, bb, pp):
+// the two running sums and the exponent they are normalised by.
+__global__ void kernel_forward_with_state_bf16(
+    const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u,
+    const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, bf16 *__restrict__ const _y,
+    float *__restrict__ const _s
+) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int _b = idx / C;
+    const int _c = idx % C;
+    const int _offset_s = _b * C * 3 + _c * 3;
+    const int _offset = _b * T * C + _c;
+
+    float u = float(_u[_c]);
+    float w = _w[_c];
+    const bf16 *__restrict__ const k = _k + _offset;
+    const bf16 *__restrict__ const v = _v + _offset;
+    bf16 *__restrict__ const y = _y + _offset;
+    float *__restrict__ const s = _s + _offset_s;
+
+    // aa and bb are running sums divided by exp(pp) (to avoid overflow)
+    float aa = s[0], bb = s[1], pp = s[2];
+    for (int i = 0; i < T; i++) {
+        const int ii = i * C;
+        const float kk = float(k[ii]);
+        const float vv = float(v[ii]);
+
+        float ww = u + kk;
+        float p = max(pp, ww);
+        float e1 = exp(pp - p);
+        float e2 = exp(ww - p);
+        // Divide in float and round to bf16 once, exactly as kernel_forward_bf16 does.
+        // Casting only the numerator would round it to bf16 before the division.
+        y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2));
+
+        ww = w + pp;
+        p = max(ww, kk);
+        e1 = exp(ww - p);
+        e2 = exp(kk - p);
+        aa = e1 * aa + e2 * vv;
+        bb = e1 * bb + e2;
+        pp = p;
+    }
+    // Persist the updated state so a later call can continue from here.
+    s[0] = aa;
+    s[1] = bb;
+    s[2] = pp;
+}
+/* Backward pass of the WKV recurrence for bf16 tensors; arithmetic is done in float.
+ * Each thread owns one (batch, channel) pair. A first loop runs forward over the T steps,
+ * accumulating gw/gu and caching q[i], r[i]; a second loop runs backward over the steps
+ * and writes gk/gv. _gw and _gu are written once per thread at index _b * C + _c.
+ * NOTE(review): Tmax is not defined in this file - presumably a compile-time define;
+ * T must not exceed it or q/r are overrun - confirm the caller enforces this.
+ */
+__global__ void kernel_backward_bf16(
+    const int B, const int T, const int C, const float *__restrict__ const _w, const bf16 *__restrict__ const _u,
+    const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v, const bf16 *__restrict__ const _y,
+    const bf16 *__restrict__ const _gy, bf16 *__restrict__ const _gw, bf16 *__restrict__ const _gu,
+    bf16 *__restrict__ const _gk, bf16 *__restrict__ const _gv
+) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int _b = idx / C;
+    const int _c = idx % C;
+    const int _offset = _b * T * C + _c; // first element of this thread's sequence; steps are C apart
+    // Per-channel parameters and per-thread views into the inputs/outputs.
+    float u = float(_u[_c]);
+    float w = _w[_c];
+    const bf16 *__restrict__ const k = _k + _offset;
+    const bf16 *__restrict__ const v = _v + _offset;
+    const bf16 *__restrict__ const y = _y + _offset;
+    const bf16 *__restrict__ const gy = _gy + _offset;
+    bf16 *__restrict__ const gk = _gk + _offset;
+    bf16 *__restrict__ const gv = _gv + _offset;
+    // q[i] caches gy / denominator and r[i] caches (u + k - p) for reuse in the second loop.
+    float q[Tmax], r[Tmax];
+    // Pass 1 (i = 0 .. T-1): accumulate gw and gu while replaying the forward recurrence.
+    float gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;
+    for (int i = 0; i < T; i++) {
+        const int ii = i * C;
+        const float kk = float(k[ii]);
+        const float vv = float(v[ii]);
+        const float yy = float(y[ii]);
+        // Same rescaling as the forward output step: p is the larger exponent.
+        float ww = u + kk;
+        float p = max(pp, ww);
+        float e1 = exp(pp - p);
+        float e2 = exp(ww - p);
+        const float qq = float(gy[ii]) / (e1 * bb + e2);
+        gw += (ga - gb * yy) * e1 * qq;
+        gu += (vv - yy) * e2 * qq;
+        q[i] = qq;
+        r[i] = ww - p;
+        // Advance the running sums (aa, bb) and their accumulators (ga, gb) by one step.
+        ww = w + pp;
+        p = max(ww, kk);
+        e1 = exp(ww - p);
+        e2 = exp(kk - p);
+        ga = e1 * (aa + ga);
+        gb = e1 * (bb + gb);
+        aa = e1 * aa + e2 * vv;
+        bb = e1 * bb + e2;
+        pp = p;
+    }
+    const int _offsetBC = _b * C + _c; // one gw/gu slot per (batch, channel)
+    _gw[_offsetBC] = bf16(gw * _w[_c]); // multiply by w because of w -> -exp(w) in python forward()
+    _gu[_offsetBC] = bf16(gu);
+    // Pass 2 (i = T-1 .. 0): reset the accumulators and walk backward to produce gk and gv.
+    aa = 0, bb = 0, pp = MIN_VALUE;
+    for (int i = T - 1; i >= 0; i--) {
+        const int ii = i * C;
+        const float kk = float(k[ii]);
+        const float vv = float(v[ii]);
+        const float yy = float(y[ii]);
+        const float qq = q[i];
+        const float rr = r[i];
+        // e1 is the contribution of step i itself; e2 scales what later steps accumulated.
+        float e1 = qq * exp(rr);
+        float e2 = exp(kk + pp);
+        gk[ii] = bf16(e1 * (vv - yy) + e2 * (aa * vv + bb));
+        gv[ii] = bf16(e1 + e2 * aa);
+        // Fold step i into the backward accumulators, rescaled by the new maximum exponent p.
+        const float ww = w + pp;
+        const float www = rr - u - kk;
+        const float p = max(ww, www);
+        e1 = exp(ww - p);
+        e2 = qq * exp(www - p);
+        aa = e1 * aa + e2;
+        bb = e1 * bb - e2 * yy;
+        pp = p;
+    }
+}
+
+// Host launcher for kernel_forward_bf16.
+// Launches min(C, 32) threads per block and B * C / threadsPerBlock.x blocks, i.e. one
+// thread per (batch, channel) pair. B * C must be a multiple of the block size (asserted).
+void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) {
+    // Empty input: nothing to compute. Returning here also avoids a zero block size
+    // (C == 0), which would divide by zero below, and a zero-sized grid (B == 0),
+    // which is an invalid launch configuration.
+    if (B <= 0 || C <= 0) return;
+    dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance
+    assert(B * C % threadsPerBlock.x == 0);
+    dim3 numBlocks(B * C / threadsPerBlock.x);
+    kernel_forward_bf16<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y);
+}
+
+// Host launcher for kernel_forward_with_state_bf16; `s` is the float state buffer
+// (three values per (batch, channel) pair) that the kernel reads and overwrites.
+// Launches min(C, 32) threads per block and B * C / threadsPerBlock.x blocks.
+// B * C must be a multiple of the block size (asserted).
+void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s) {
+    // Empty input: nothing to compute. Returning here also avoids a zero block size
+    // (C == 0), which would divide by zero below, and a zero-sized grid (B == 0),
+    // which is an invalid launch configuration.
+    if (B <= 0 || C <= 0) return;
+    dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance
+    assert(B * C % threadsPerBlock.x == 0);
+    dim3 numBlocks(B * C / threadsPerBlock.x);
+    kernel_forward_with_state_bf16<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y, s);
+}
+
+// Host launcher for kernel_backward_bf16; gy is the incoming gradient, gw/gu/gk/gv receive the results.
+// Launches min(C, 32) threads per block and B * C / threadsPerBlock.x blocks, i.e. one
+// thread per (batch, channel) pair. B * C must be a multiple of the block size (asserted).
+void cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv) {
+    // Empty input: nothing to compute. Returning here also avoids a zero block size
+    // (C == 0), which would divide by zero below, and a zero-sized grid (B == 0),
+    // which is an invalid launch configuration.
+    if (B <= 0 || C <= 0) return;
+    dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance
+    assert(B * C % threadsPerBlock.x == 0);
+    dim3 numBlocks(B * C / threadsPerBlock.x);
+    kernel_backward_bf16<<<numBlocks, threadsPerBlock>>>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_op.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55e7280665927b523a88021d5111daf28a63c905
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/rwkv/wkv_op.cpp
@@ -0,0 +1,66 @@
+#include <torch/extension.h>
+#include "ATen/ATen.h"
+typedef at::BFloat16 bf16;
+/* Launchers implemented in the CUDA sources; each takes the sizes B, T, C followed by raw data pointers. */
+void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y);
+void cuda_forward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y); // w stays float in the bf16 variants
+void cuda_forward_with_state(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *s);
+void cuda_forward_with_state_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, float *s); // state s stays float
+void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv);
+void cuda_backward_bf16(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv);
+
+// fp32 WKV forward binding: B, T, C are taken from the first three dimensions of k,
+// and the tensors' float storage is handed to the CUDA launcher.
+void forward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {
+    const int batch = k.size(0);
+    const int seq_len = k.size(1);
+    const int channels = k.size(2);
+    float *w_ptr = w.data_ptr<float>();
+    float *u_ptr = u.data_ptr<float>();
+    float *k_ptr = k.data_ptr<float>();
+    float *v_ptr = v.data_ptr<float>();
+    float *y_ptr = y.data_ptr<float>();
+    cuda_forward(batch, seq_len, channels, w_ptr, u_ptr, k_ptr, v_ptr, y_ptr);
+}
+// bf16 WKV forward binding: B, T, C are taken from the first three dimensions of k.
+// w is read as float; u, k, v and y are read as bf16.
+void forward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {
+    const int batch = k.size(0);
+    const int seq_len = k.size(1);
+    const int channels = k.size(2);
+    float *w_ptr = w.data_ptr<float>();
+    bf16 *u_ptr = u.data_ptr<bf16>();
+    bf16 *k_ptr = k.data_ptr<bf16>();
+    bf16 *v_ptr = v.data_ptr<bf16>();
+    bf16 *y_ptr = y.data_ptr<bf16>();
+    cuda_forward_bf16(batch, seq_len, channels, w_ptr, u_ptr, k_ptr, v_ptr, y_ptr);
+}
+// fp32 WKV forward binding with a state tensor: B, T, C are taken from the first three
+// dimensions of k; s is passed through to the launcher as a float buffer.
+void forward_with_state(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) {
+    const int batch = k.size(0);
+    const int seq_len = k.size(1);
+    const int channels = k.size(2);
+    float *w_ptr = w.data_ptr<float>();
+    float *u_ptr = u.data_ptr<float>();
+    float *k_ptr = k.data_ptr<float>();
+    float *v_ptr = v.data_ptr<float>();
+    float *y_ptr = y.data_ptr<float>();
+    float *s_ptr = s.data_ptr<float>();
+    cuda_forward_with_state(batch, seq_len, channels, w_ptr, u_ptr, k_ptr, v_ptr, y_ptr, s_ptr);
+}
+// bf16 WKV forward binding with a state tensor: B, T, C are taken from the first three
+// dimensions of k. w and s are read as float; u, k, v and y are read as bf16.
+void forward_with_state_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &s) {
+    const int batch = k.size(0);
+    const int seq_len = k.size(1);
+    const int channels = k.size(2);
+    float *w_ptr = w.data_ptr<float>();
+    bf16 *u_ptr = u.data_ptr<bf16>();
+    bf16 *k_ptr = k.data_ptr<bf16>();
+    bf16 *v_ptr = v.data_ptr<bf16>();
+    bf16 *y_ptr = y.data_ptr<bf16>();
+    float *s_ptr = s.data_ptr<float>();
+    cuda_forward_with_state_bf16(batch, seq_len, channels, w_ptr, u_ptr, k_ptr, v_ptr, y_ptr, s_ptr);
+}
+// fp32 WKV backward binding: B, T, C are taken from the first three dimensions of k.
+// gy is the incoming gradient; gw, gu, gk and gv are the output buffers.
+void backward(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {
+    const int batch = k.size(0);
+    const int seq_len = k.size(1);
+    const int channels = k.size(2);
+    float *w_ptr = w.data_ptr<float>();
+    float *u_ptr = u.data_ptr<float>();
+    float *k_ptr = k.data_ptr<float>();
+    float *v_ptr = v.data_ptr<float>();
+    float *y_ptr = y.data_ptr<float>();
+    float *gy_ptr = gy.data_ptr<float>();
+    float *gw_ptr = gw.data_ptr<float>();
+    float *gu_ptr = gu.data_ptr<float>();
+    float *gk_ptr = gk.data_ptr<float>();
+    float *gv_ptr = gv.data_ptr<float>();
+    cuda_backward(batch, seq_len, channels, w_ptr, u_ptr, k_ptr, v_ptr, y_ptr, gy_ptr, gw_ptr, gu_ptr, gk_ptr, gv_ptr);
+}
+// bf16 WKV backward binding: B, T, C are taken from the first three dimensions of k.
+// w is read as float; every other tensor, including the gradient buffers, is read as bf16.
+void backward_bf16(torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {
+    const int batch = k.size(0);
+    const int seq_len = k.size(1);
+    const int channels = k.size(2);
+    float *w_ptr = w.data_ptr<float>();
+    bf16 *u_ptr = u.data_ptr<bf16>();
+    bf16 *k_ptr = k.data_ptr<bf16>();
+    bf16 *v_ptr = v.data_ptr<bf16>();
+    bf16 *y_ptr = y.data_ptr<bf16>();
+    bf16 *gy_ptr = gy.data_ptr<bf16>();
+    bf16 *gw_ptr = gw.data_ptr<bf16>();
+    bf16 *gu_ptr = gu.data_ptr<bf16>();
+    bf16 *gk_ptr = gk.data_ptr<bf16>();
+    bf16 *gv_ptr = gv.data_ptr<bf16>();
+    cuda_backward_bf16(batch, seq_len, channels, w_ptr, u_ptr, k_ptr, v_ptr, y_ptr,
+        gy_ptr, gw_ptr, gu_ptr, gk_ptr, gv_ptr);
+}
+/* Python extension module: exposes the six wrappers above under their own names, each with a short docstring. */
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &forward, "wkv forward");
+    m.def("forward_bf16", &forward_bf16, "wkv forward bf16");
+    m.def("forward_with_state", &forward_with_state, "wkv forward with state");
+    m.def("forward_with_state_bf16", &forward_with_state_bf16, "wkv forward with state bf16");
+    m.def("backward", &backward, "wkv backward");
+    m.def("backward_bf16", &backward_bf16, "wkv backward bf16");
+}
+/* Registers the same six wrappers as operators in the `wkv` library, in addition to the pybind11 module. */
+TORCH_LIBRARY(wkv, m) {
+    m.def("forward", forward);
+    m.def("forward_bf16", forward_bf16);
+    m.def("forward_with_state", forward_with_state);
+    m.def("forward_with_state_bf16", forward_with_state_bf16);
+    m.def("backward", backward);
+    m.def("backward_bf16", backward_bf16);
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5085c88dd3ea9a12eec264a8c48946bf2b80b23
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common.h
@@ -0,0 +1,10 @@
+/* Shared helper macros and constants. The function-like macros may evaluate an argument more than once, so avoid arguments with side effects. */
+#define min(a, b) ((a)<(b)?(a):(b)) // smaller of a and b
+#define max(a, b) ((a)>(b)?(a):(b)) // larger of a and b
+#define ceil_divide(a, b) ((a)/(b)+((a)%(b)!=0)) // a / b, plus one when the division leaves a remainder
+#define select(cond, a, b) ((cond)?(a):(b)) // a when cond is true, otherwise b
+#define PI 3.141592
+#define EPSILON 1e-8
+#define MAX_VAL 1e12
+#define MIN_VAL -1e12 // negation of MAX_VAL; expands without surrounding parentheses
+#define EMPTY_VALUE -1 // expands without surrounding parentheses; presumably the marker for an unused slot - confirm against users
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..97030870649a2fdac58cb26cf966e8f5c8cc7909
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda.h
@@ -0,0 +1,9 @@
+/* Launch-configuration constants for the CUDA kernels. NOTE(review): the limit values are hard-coded rather than queried from the device - confirm they match the target GPUs. */
+#define MAX_THREADS_PER_BLOCK 1024
+#define OPTIMAL_THREADS_PER_BLOCK 256 // block size the host launchers aim for
+#define WARP_SIZE 32
+#define MAX_NUM_BLOCK_X 2147483647 // 2^31 - 1
+#define MAX_NUM_BLOCK_Y 65535
+#define MAX_NUM_BLOCK_Z 65535
+#define MAX_SHARED_MEM_PER_BLOCK 48000 // presumably in bytes - TODO confirm
+#define FULL_MASK 0xffffffff // all 32 bits set
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda_device.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..6674f93afdc25ab35c5d83881d00028bcf2989fc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/common_cuda_device.h
@@ -0,0 +1,79 @@
+
+#include "common.h"
+
+// Inserts `value` into an open-addressing set using linear probing.
+// Probing starts at value % set_size and wraps around the table once.
+// Returns the slot that now holds `value` (freshly claimed or already present),
+// or -1 when every slot was probed without finding room.
+template<typename T>
+__device__ int set_insert(T *set, int set_size, T value) {
+  const int home = value % set_size;
+  int probe = home;
+  do {
+    // atomicCAS stores `value` only if the slot still holds EMPTY_VALUE and
+    // returns whatever the slot contained beforehand.
+    const T seen = atomicCAS(&set[probe], EMPTY_VALUE, value);
+    if (seen == EMPTY_VALUE || seen == value) {
+      return probe;
+    }
+    probe = (probe + 1) % set_size;
+  } while (probe != home);
+  return -1;
+}
+
+// Looks `value` up in an open-addressing set using linear probing.
+// Probing starts at value % set_size and wraps around the table once.
+// Returns the slot holding `value`, or -1 when no slot matches.
+template<typename T>
+__device__ int set_lookup(T *set, int set_size, T value) {
+  const int home = value % set_size;
+  int probe = home;
+  do {
+    if (set[probe] == value) {
+      return probe;
+    }
+    probe = (probe + 1) % set_size;
+  } while (probe != home);
+  return -1;
+}
+
+// Fills buffer[0 .. buffer_size) with init_value, spreading the writes over num_threads
+// cooperating threads (this thread handles thread_id, thread_id + num_threads, ...).
+// Brackets the fill with __syncthreads(), so every thread of the block must call it.
+template<typename T>
+__device__ void init_buffer(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) {
+  __syncthreads();
+  int base = 0;
+  while (base < buffer_size) {
+    const int pos = base + thread_id;
+    // The last stripe may be only partially inside the buffer.
+    if (pos < buffer_size) {
+      buffer[pos] = init_value;
+    }
+    base = base + num_threads;
+  }
+  __syncthreads();
+}
+
+// Copies data_length elements from src_pt to dist_pt, spreading the work over num_threads
+// cooperating threads (this thread handles thread_id, thread_id + num_threads, ...).
+// Brackets the copy with __syncthreads(), so every thread of the block must call it.
+template<typename T>
+__device__ void copy_data(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) {
+  __syncthreads();
+  int base = 0;
+  while (base < data_length) {
+    const int pos = base + thread_id;
+    // The last stripe may be only partially inside the data.
+    if (pos < data_length) {
+      dist_pt[pos] = src_pt[pos];
+    }
+    base = base + num_threads;
+  }
+  __syncthreads();
+}
+
+// Fills buffer[0 .. buffer_size) with init_value, spreading the writes over num_threads
+// cooperating threads (this thread handles thread_id, thread_id + num_threads, ...).
+// Performs no barrier; the caller is responsible for any synchronisation.
+template<typename T>
+__device__ void init_buffer_nonblocking(T init_value, T *buffer, int buffer_size, int num_threads, int thread_id) {
+  int base = 0;
+  while (base < buffer_size) {
+    const int pos = base + thread_id;
+    // The last stripe may be only partially inside the buffer.
+    if (pos < buffer_size) {
+      buffer[pos] = init_value;
+    }
+    base = base + num_threads;
+  }
+}
+
+// Copies data_length elements from src_pt to dist_pt, spreading the work over num_threads
+// cooperating threads (this thread handles thread_id, thread_id + num_threads, ...).
+// Performs no barrier; the caller is responsible for any synchronisation.
+template<typename T>
+__device__ void copy_data_nonblocking(T *src_pt, T *dist_pt, int data_length, int num_threads, int thread_id) {
+  int base = 0;
+  while (base < data_length) {
+    const int pos = base + thread_id;
+    // The last stripe may be only partially inside the data.
+    if (pos < data_length) {
+      dist_pt[pos] = src_pt[pos];
+    }
+    base = base + num_threads;
+  }
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c6b13e6cb5f53c9c62e51d2c399a14d14dab7037
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.cu
@@ -0,0 +1,588 @@
+// File from https://github.com/mlpen/YOSO/blob/main/encoders/backbones/efficient_attentions/yoso/yoso_v1/cuda/fast_lsh_cumulation.cu
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "fast_lsh_cumulation.h"
+#include "fast_lsh_cumulation_cuda.h"
+#include "common_cuda.h"
+#include "common.h"
+#include <vector>
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+/* Host launcher that computes hash codes for queries and keys.
+ * Allocates a random +1/-1 tensor Dmat plus two zero-filled hash-code tensors, then (when
+ * use_cuda is true) launches fast_hash_ver1_cuda_kernel once for the queries and once for
+ * the keys, both sharing the same Dmat. Returns {query_hash_code, key_hash_code}; when
+ * use_cuda is false no kernel runs and both outputs stay zero.
+ */
+std::vector<at::Tensor> fast_hash_ver1_kernel(
+  at::Tensor query_mask,
+  at::Tensor query_vector,
+  at::Tensor key_mask,
+  at::Tensor key_vector,
+  int num_hash_f,
+  int hash_code_len,
+  bool use_cuda
+) {
+  // Sizes are read from dims 0..2 of query_vector and dim 1 of key_vector.
+  int batch_size = query_vector.size(0);
+  int num_query = query_vector.size(1);
+  int num_key = key_vector.size(1);
+  int vector_dim = query_vector.size(2);
+  // Integer division: hash codes per part, then the number of parts (at least 1) needed for num_hash_f.
+  int num_hash_per_part = vector_dim / hash_code_len;
+  int num_part = max(1, ceil_divide(num_hash_f, num_hash_per_part));
+  // Dmat holds values in {-1, +1}; all three tensors take dtype/device from the mask options.
+  at::Tensor Dmat = 2 * at::randint(0, 2, {batch_size, 3, num_part, vector_dim}, query_mask.options()) - 1;
+  at::Tensor query_hash_code = at::zeros({batch_size, num_query, num_hash_f}, query_mask.options());
+  at::Tensor key_hash_code = at::zeros({batch_size, num_key, num_hash_f}, key_mask.options());
+  // Masks are accessed as int and vectors as float.
+  int *query_mask_ptr = query_mask.data_ptr<int>();
+  float *query_vector_ptr = query_vector.data_ptr<float>();
+  int *key_mask_ptr = key_mask.data_ptr<int>();
+  float *key_vector_ptr = key_vector.data_ptr<float>();
+
+  int *Dmat_ptr = Dmat.data_ptr<int>();
+
+  int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+  int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+
+  if (use_cuda) {
+    { // queries: vector_dim threads per block, one block per (part, query, batch)
+      dim3 threads(vector_dim);
+      dim3 blocks(num_part, num_query, batch_size);
+      int shared_mem = vector_dim * sizeof(float); // dynamic shared memory: one float per thread
+      fast_hash_ver1_cuda_kernel<<<blocks, threads, shared_mem>>>(
+        query_mask_ptr,
+        query_vector_ptr,
+        Dmat_ptr,
+        query_hash_code_ptr,
+        batch_size,
+        num_query,
+        vector_dim,
+        num_part,
+        num_hash_f,
+        hash_code_len
+      );
+    }
+    { // keys: same configuration with num_key in place of num_query
+      dim3 threads(vector_dim);
+      dim3 blocks(num_part, num_key, batch_size);
+      int shared_mem = vector_dim * sizeof(float);
+      fast_hash_ver1_cuda_kernel<<<blocks, threads, shared_mem>>>(
+        key_mask_ptr,
+        key_vector_ptr,
+        Dmat_ptr,
+        key_hash_code_ptr,
+        batch_size,
+        num_key,
+        vector_dim,
+        num_part,
+        num_hash_f,
+        hash_code_len
+      );
+    }
+  }
+
+  return {query_hash_code, key_hash_code};
+
+}
+/* Host launcher for the two-step LSH cumulation (version 1).
+ * For each WARP_SIZE-wide slice of the value dimension it zeroes the hash-table buffer,
+ * runs the step-1 kernel over the keys and then the step-2 kernel over the queries.
+ * Returns cumulation_value, shaped {batch_size, num_query, value_dim}; when use_cuda is
+ * false no kernel runs and the result stays zero.
+ */
+at::Tensor lsh_cumulation_ver1_kernel(
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor value,
+  int hashtable_capacity,
+  bool use_cuda
+) {
+
+  int batch_size = query_hash_code.size(0);
+  int num_hash_f = query_hash_code.size(2);
+
+  int num_query = query_hash_code.size(1);
+  int num_key = key_hash_code.size(1);
+  int value_dim = value.size(2);
+  // hashtable_value is left uninitialised here; it is cleared with cudaMemset before every use below.
+  at::Tensor hashtable_value = at::empty({batch_size, num_hash_f, hashtable_capacity, WARP_SIZE}, value.options());
+  at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+  if (use_cuda) {
+    int threads_x = WARP_SIZE;
+    int threads_y = OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE;
+    int block_x_step1 = num_key / threads_y; // NOTE(review): integer division - a remainder of num_key is not covered by the grid; confirm callers pad
+    int block_x_step2 = num_query / threads_y; // NOTE(review): same truncation for num_query
+    int block_y = batch_size;
+
+    dim3 threads(threads_x, threads_y);
+    dim3 blocks_step1(block_x_step1, block_y);
+    dim3 blocks_step2(block_x_step2, block_y);
+    // Masks and hash codes are accessed as int; value and both buffers as float.
+    int *query_mask_ptr = query_mask.data_ptr<int>();
+    int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+    int *key_mask_ptr = key_mask.data_ptr<int>();
+    int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+    float *value_ptr = value.data_ptr<float>();
+    float *hashtable_value_ptr = hashtable_value.data_ptr<float>();
+    float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+    // One iteration per WARP_SIZE-wide slice of the value dimension.
+    for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+      // Reset the whole hash table before this slice (all-zero bytes, i.e. 0.0f).
+      cudaMemset(hashtable_value_ptr, 0, (batch_size * num_hash_f * hashtable_capacity * WARP_SIZE) * sizeof(float));
+      // Step 1: launched over the keys.
+      lsh_cumulation_ver1_step1_cuda_kernel<<<blocks_step1, threads>>>(
+        key_mask_ptr,
+        key_hash_code_ptr,
+        value_ptr,
+        hashtable_value_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_key,
+        value_dim,
+        value_offset
+      );
+      // Step 2: launched over the queries.
+      lsh_cumulation_ver1_step2_cuda_kernel<<<blocks_step2, threads>>>(
+        query_mask_ptr,
+        query_hash_code_ptr,
+        hashtable_value_ptr,
+        cumulation_value_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_query,
+        value_dim,
+        value_offset
+      );
+    }
+
+  }
+
+  return cumulation_value;
+
+}
+/* Host launcher for the weighted two-step LSH cumulation (version 1).
+ * For every WARP_SIZE-wide slice of the value dimension and every weight index it zeroes
+ * the hash-table buffer, runs the step-1 kernel over the keys and then the step-2 kernel
+ * over the queries. Returns cumulation_value, shaped {batch_size, num_query, value_dim};
+ * when use_cuda is false no kernel runs and the result stays zero.
+ */
+at::Tensor lsh_weighted_cumulation_ver1_kernel(
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor query_weight,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor key_weight,
+  at::Tensor value,
+  int hashtable_capacity,
+  bool use_cuda
+) {
+
+  int batch_size = query_hash_code.size(0);
+  int num_hash_f = query_hash_code.size(2);
+
+  int num_query = query_hash_code.size(1);
+  int num_key = key_hash_code.size(1);
+  int value_dim = value.size(2);
+  int weight_dim = query_weight.size(2);
+
+  at::Tensor hashtable_value = at::zeros({batch_size, num_hash_f, hashtable_capacity, WARP_SIZE}, value.options());
+  at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+  if (use_cuda) {
+    int threads_x = WARP_SIZE;
+    int threads_y = OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE;
+    int block_x_step1 = num_key / threads_y; // NOTE(review): integer division - a remainder of num_key is not covered by the grid; confirm callers pad
+    int block_x_step2 = num_query / threads_y; // NOTE(review): same truncation for num_query
+    int block_y = batch_size;
+
+    dim3 threads(threads_x, threads_y);
+    dim3 blocks_step1(block_x_step1, block_y);
+    dim3 blocks_step2(block_x_step2, block_y);
+    // Masks and hash codes are accessed as int; weights, value and both buffers as float.
+    int *query_mask_ptr = query_mask.data_ptr<int>();
+    int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+    float *query_weight_ptr = query_weight.data_ptr<float>();
+    int *key_mask_ptr = key_mask.data_ptr<int>();
+    int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+    float *key_weight_ptr = key_weight.data_ptr<float>();
+    float *value_ptr = value.data_ptr<float>();
+    float *hashtable_value_ptr = hashtable_value.data_ptr<float>();
+    float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+    // Outer loop: WARP_SIZE-wide slices of the value dimension; inner loop: one pass per weight index.
+    for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+      for (int weight_idx = 0; weight_idx < weight_dim; weight_idx++) {
+        // Reset the whole hash table before this (slice, weight index) pass.
+        cudaMemset(hashtable_value_ptr, 0, (batch_size * num_hash_f * hashtable_capacity * WARP_SIZE) * sizeof(float));
+        // Step 1: launched over the keys.
+        lsh_weighted_cumulation_ver1_step1_cuda_kernel<<<blocks_step1, threads>>>(
+          key_mask_ptr,
+          key_hash_code_ptr,
+          key_weight_ptr,
+          value_ptr,
+          hashtable_value_ptr,
+          batch_size,
+          num_hash_f,
+          hashtable_capacity,
+          num_key,
+          value_dim,
+          weight_dim,
+          value_offset,
+          weight_idx
+        );
+        // Step 2: launched over the queries.
+        lsh_weighted_cumulation_ver1_step2_cuda_kernel<<<blocks_step2, threads>>>(
+          query_mask_ptr,
+          query_hash_code_ptr,
+          query_weight_ptr,
+          hashtable_value_ptr,
+          cumulation_value_ptr,
+          batch_size,
+          num_hash_f,
+          hashtable_capacity,
+          num_query,
+          value_dim,
+          weight_dim,
+          value_offset,
+          weight_idx
+        );
+      }
+    }
+
+  }
+
+  return cumulation_value;
+
+}
+/* Host launcher for the weighted LSH cumulation, version 2.
+ * Launch order: three count-sort kernels over the keys (filling count_sort_table and
+ * key_sorted_idxes), then extract_query_info_cuda_kernel over the queries (filling
+ * query_info), then the step-2 cumulation kernel with one block per (query, hash function,
+ * batch). Returns cumulation_value, shaped {batch_size, num_query, value_dim}; when
+ * use_cuda is false no kernel runs and the result stays zero.
+ */
+at::Tensor lsh_weighted_cumulation_ver2_kernel(
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor query_weight,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor key_weight,
+  at::Tensor value,
+  int hashtable_capacity,
+  bool use_cuda
+) {
+
+  int batch_size = query_hash_code.size(0);
+  int num_hash_f = query_hash_code.size(2);
+
+  int num_query = query_hash_code.size(1);
+  int num_key = key_hash_code.size(1);
+  int value_dim = value.size(2);
+  int weight_dim = query_weight.size(2);
+  // Scratch tensors share the options of query_hash_code; the result shares those of value.
+  at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options());
+  at::Tensor key_sorted_idxes = at::zeros({batch_size, num_hash_f, num_key}, query_hash_code.options());
+  at::Tensor query_info = at::zeros({batch_size, num_query, 2, num_hash_f}, query_hash_code.options());
+  at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+  if (use_cuda) {
+    // Masks, hash codes and scratch tensors are accessed as int; weights and value as float.
+    int *query_mask_ptr = query_mask.data_ptr<int>();
+    int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+    float *query_weight_ptr = query_weight.data_ptr<float>();
+    int *key_mask_ptr = key_mask.data_ptr<int>();
+    int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+    float *key_weight_ptr = key_weight.data_ptr<float>();
+    float *value_ptr = value.data_ptr<float>();
+
+    int *count_sort_table_ptr = count_sort_table.data_ptr<int>();
+    int *key_sorted_idxes_ptr = key_sorted_idxes.data_ptr<int>();
+    int *query_info_ptr = query_info.data_ptr<int>();
+
+    float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+    { // Count sort over the keys: steps 1 and 3 share one launch shape, step 2 uses its own.
+      dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+      dim3 blocks_step13(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); // NOTE(review): integer division truncates - confirm num_key is a multiple of threads.y
+      dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK));
+      dim3 blocks_step2(num_hash_f, batch_size);
+      int shared_mem = hashtable_capacity * sizeof(float); // dynamic shared memory for step 2: hashtable_capacity 4-byte slots
+      count_sort_step1_cuda_kernel<<<blocks_step13, threads_step13>>>(
+        key_mask_ptr,
+        key_hash_code_ptr,
+        count_sort_table_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_key
+      );
+      count_sort_step2_cuda_kernel<<<blocks_step2, threads_step2, shared_mem>>>(
+        count_sort_table_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity
+      );
+      count_sort_step3_cuda_kernel<<<blocks_step13, threads_step13>>>(
+        key_mask_ptr,
+        key_hash_code_ptr,
+        count_sort_table_ptr,
+        key_sorted_idxes_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_key
+      );
+    }
+    { // Fill query_info from the query hash codes and the count-sort table.
+      dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+      dim3 blocks(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); // NOTE(review): same truncating division, here for num_query
+      extract_query_info_cuda_kernel<<<blocks, threads>>>(
+        query_mask_ptr,
+        query_hash_code_ptr,
+        count_sort_table_ptr,
+        query_info_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_query
+      );
+    }
+    { // Cumulation: one block per (query, hash function, batch).
+      dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE);
+      dim3 blocks(num_query, num_hash_f, batch_size);
+      int shared_mem = (weight_dim + WARP_SIZE) * sizeof(float); // dynamic shared memory: weight_dim + WARP_SIZE floats
+      lsh_weighted_cumulation_ver2_step2_cuda_kernel<<<blocks, threads, shared_mem>>>(
+        query_mask_ptr,
+        query_info_ptr,
+        key_sorted_idxes_ptr,
+        query_weight_ptr,
+        key_weight_ptr,
+        value_ptr,
+        cumulation_value_ptr,
+        batch_size,
+        num_hash_f,
+        num_query,
+        num_key,
+        value_dim,
+        weight_dim
+      );
+    }
+  }
+
+  return cumulation_value;
+
+}
+/* Host launcher for the weighted LSH cumulation, version 3.
+ * Same pipeline as version 2 with the roles swapped: the three count-sort kernels run over
+ * the queries (filling count_sort_table and query_sorted_idxes), extract_query_info_cuda_kernel
+ * runs over the keys (filling key_info), and the step-2 cumulation kernel uses one block per
+ * (key, hash function, batch). Returns cumulation_value, shaped {batch_size, num_query,
+ * value_dim}; when use_cuda is false no kernel runs and the result stays zero.
+ */
+at::Tensor lsh_weighted_cumulation_ver3_kernel(
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor query_weight,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor key_weight,
+  at::Tensor value,
+  int hashtable_capacity,
+  bool use_cuda
+) {
+
+  int batch_size = query_hash_code.size(0);
+  int num_hash_f = query_hash_code.size(2);
+
+  int num_query = query_hash_code.size(1);
+  int num_key = key_hash_code.size(1);
+  int value_dim = value.size(2);
+  int weight_dim = query_weight.size(2);
+  // Scratch tensors share the options of query_hash_code; the result shares those of value.
+  at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options());
+  at::Tensor query_sorted_idxes = at::zeros({batch_size, num_hash_f, num_query}, query_hash_code.options());
+  at::Tensor key_info = at::zeros({batch_size, num_key, 2, num_hash_f}, query_hash_code.options());
+  at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+  if (use_cuda) {
+    // Masks, hash codes and scratch tensors are accessed as int; weights and value as float.
+    int *query_mask_ptr = query_mask.data_ptr<int>();
+    int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+    float *query_weight_ptr = query_weight.data_ptr<float>();
+    int *key_mask_ptr = key_mask.data_ptr<int>();
+    int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+    float *key_weight_ptr = key_weight.data_ptr<float>();
+    float *value_ptr = value.data_ptr<float>();
+
+    int *count_sort_table_ptr = count_sort_table.data_ptr<int>();
+    int *query_sorted_idxes_ptr = query_sorted_idxes.data_ptr<int>();
+    int *key_info_ptr = key_info.data_ptr<int>();
+
+    float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+    { // Count sort over the queries: steps 1 and 3 share one launch shape, step 2 uses its own.
+      dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+      dim3 blocks_step13(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); // NOTE(review): integer division truncates - confirm num_query is a multiple of threads.y
+      dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK));
+      dim3 blocks_step2(num_hash_f, batch_size);
+      int shared_mem = hashtable_capacity * sizeof(float); // dynamic shared memory for step 2: hashtable_capacity 4-byte slots
+      count_sort_step1_cuda_kernel<<<blocks_step13, threads_step13>>>(
+        query_mask_ptr,
+        query_hash_code_ptr,
+        count_sort_table_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_query
+      );
+      count_sort_step2_cuda_kernel<<<blocks_step2, threads_step2, shared_mem>>>(
+        count_sort_table_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity
+      );
+      count_sort_step3_cuda_kernel<<<blocks_step13, threads_step13>>>(
+        query_mask_ptr,
+        query_hash_code_ptr,
+        count_sort_table_ptr,
+        query_sorted_idxes_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_query
+      );
+    }
+    { // Fill key_info from the key hash codes and the count-sort table (the same kernel version 2 runs on queries).
+      dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+      dim3 blocks(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); // NOTE(review): same truncating division, here for num_key
+      extract_query_info_cuda_kernel<<<blocks, threads>>>(
+        key_mask_ptr,
+        key_hash_code_ptr,
+        count_sort_table_ptr,
+        key_info_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_key
+      );
+    }
+    { // Cumulation: one block per (key, hash function, batch).
+      dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE);
+      dim3 blocks(num_key, num_hash_f, batch_size);
+      int shared_mem = (weight_dim + value_dim + WARP_SIZE) * sizeof(float); // dynamic shared memory: weight_dim + value_dim + WARP_SIZE floats
+      lsh_weighted_cumulation_ver3_step2_cuda_kernel<<<blocks, threads, shared_mem>>>(
+        query_sorted_idxes_ptr,
+        key_mask_ptr,
+        key_info_ptr,
+        query_weight_ptr,
+        key_weight_ptr,
+        value_ptr,
+        cumulation_value_ptr,
+        batch_size,
+        num_hash_f,
+        num_query,
+        num_key,
+        value_dim,
+        weight_dim
+      );
+    }
+  }
+
+  return cumulation_value;
+
+}
+
+at::Tensor lsh_weighted_cumulation_ver4_kernel( // Host launcher: count-sorts the queries by hash code, then runs the ver4 step-2 kernel with one block per key.
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,  // sizes are read below as [batch_size, num_query, num_hash_f]
+  at::Tensor query_weight,     // size(2) is read as weight_dim
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,    // size(1) is read as num_key
+  at::Tensor key_weight,
+  at::Tensor value,            // size(2) is read as value_dim
+  int hashtable_capacity,      // number of buckets per (batch, hash function) in count_sort_table
+  bool use_cuda                // when false nothing is launched and the zero-filled result is returned
+) {
+  // Returns cumulation_value, allocated below as zeros of shape [batch_size, num_query, value_dim].
+  int batch_size = query_hash_code.size(0);
+  int num_hash_f = query_hash_code.size(2);
+
+  int num_query = query_hash_code.size(1);
+  int num_key = key_hash_code.size(1);
+  int value_dim = value.size(2);
+  int weight_dim = query_weight.size(2);
+  // Scratch tensors take their options from query_hash_code; the result takes its options from value.
+  at::Tensor count_sort_table = at::zeros({batch_size, num_hash_f, hashtable_capacity}, query_hash_code.options());
+  at::Tensor query_sorted_idxes = at::zeros({batch_size, num_hash_f, num_query}, query_hash_code.options());
+  at::Tensor key_info = at::zeros({batch_size, num_key, 2, num_hash_f}, query_hash_code.options());
+  at::Tensor cumulation_value = at::zeros({batch_size, num_query, value_dim}, value.options());
+
+  if (use_cuda) {
+    // Masks and hash codes are accessed as int, weights and value as float. NOTE(review): the kernels index these raw pointers densely, so contiguous tensors are presumably required - confirm at the call site.
+    int *query_mask_ptr = query_mask.data_ptr<int>();
+    int *query_hash_code_ptr = query_hash_code.data_ptr<int>();
+    float *query_weight_ptr = query_weight.data_ptr<float>();
+    int *key_mask_ptr = key_mask.data_ptr<int>();
+    int *key_hash_code_ptr = key_hash_code.data_ptr<int>();
+    float *key_weight_ptr = key_weight.data_ptr<float>();
+    float *value_ptr = value.data_ptr<float>();
+
+    int *count_sort_table_ptr = count_sort_table.data_ptr<int>();
+    int *query_sorted_idxes_ptr = query_sorted_idxes.data_ptr<int>();
+    int *key_info_ptr = key_info.data_ptr<int>();
+
+    float *cumulation_value_ptr = cumulation_value.data_ptr<float>();
+
+    { // Counting sort of the QUERIES per (batch, hash function); the count_sort kernels name their inputs key_*, but query data is passed here.
+      dim3 threads_step13(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+      dim3 blocks_step13(num_query / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); // integer division: queries past the last full block get no thread - NOTE(review): confirm num_query is always a multiple of the per-block row count
+      dim3 threads_step2(min(hashtable_capacity, OPTIMAL_THREADS_PER_BLOCK));
+      dim3 blocks_step2(num_hash_f, batch_size);
+      int shared_mem = hashtable_capacity * sizeof(float); // bytes of dynamic shared memory for step 2, which reinterprets the buffer as int
+      count_sort_step1_cuda_kernel<<<blocks_step13, threads_step13>>>(
+        query_mask_ptr,
+        query_hash_code_ptr,
+        count_sort_table_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_query
+      );
+      count_sort_step2_cuda_kernel<<<blocks_step2, threads_step2, shared_mem>>>(
+        count_sort_table_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity
+      );
+      count_sort_step3_cuda_kernel<<<blocks_step13, threads_step13>>>(
+        query_mask_ptr,
+        query_hash_code_ptr,
+        count_sort_table_ptr,
+        query_sorted_idxes_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_query
+      );
+    }
+    { // For every key and hash function, look up the key's bucket in the query table and store (offset, count) in key_info.
+      dim3 threads(num_hash_f, max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f));
+      dim3 blocks(num_key / max(1, OPTIMAL_THREADS_PER_BLOCK / num_hash_f), batch_size); // same integer-division caveat as above, this time for num_key
+      extract_query_info_cuda_kernel<<<blocks, threads>>>(
+        key_mask_ptr,
+        key_hash_code_ptr,
+        count_sort_table_ptr,
+        key_info_ptr,
+        batch_size,
+        num_hash_f,
+        hashtable_capacity,
+        num_key
+      );
+    }
+    { // Accumulation: one block per (key, batch), WARP_SIZE x (OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE) threads.
+      dim3 threads(WARP_SIZE, OPTIMAL_THREADS_PER_BLOCK / WARP_SIZE);
+      dim3 blocks(num_key, batch_size);
+      int shared_mem = (weight_dim + value_dim + 2 * num_hash_f) * sizeof(float); // bytes of dynamic shared memory; the layout is defined by the step-2 kernel
+      lsh_weighted_cumulation_ver4_step2_cuda_kernel<<<blocks, threads, shared_mem>>>(
+        query_sorted_idxes_ptr,
+        key_mask_ptr,
+        key_info_ptr,
+        query_weight_ptr,
+        key_weight_ptr,
+        value_ptr,
+        cumulation_value_ptr,
+        batch_size,
+        num_hash_f,
+        num_query,
+        num_key,
+        value_dim,
+        weight_dim
+      );
+    }
+  }
+  // No launch-error check or synchronization is performed here; kernels are issued with the default launch stream argument.
+  return cumulation_value;
+
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd48de0ed159f49ee3afe93b12aaae719fe87688
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation.h
@@ -0,0 +1,71 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <vector>
+
+std::vector<at::Tensor> fast_hash_ver1_kernel( // Declaration only; the definition is not part of this header.
+  at::Tensor query_mask,
+  at::Tensor query_vector,
+  at::Tensor key_mask,
+  at::Tensor key_vector,
+  int num_hash_f,     // presumably the number of hash functions per vector - confirm against the definition
+  int hash_code_len,  // presumably the number of bits per hash code - confirm against the definition
+  bool use_cuda       // NOTE(review): presumably gates the CUDA path, as in the cumulation entry points - confirm
+);
+
+at::Tensor lsh_cumulation_ver1_kernel( // Declaration only; unweighted LSH cumulation entry point (definition not part of this header).
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor value,
+  int hashtable_capacity,  // presumably the number of buckets per hash function - confirm against the definition
+  bool use_cuda            // NOTE(review): presumably gates the CUDA path - confirm
+);
+
+at::Tensor lsh_weighted_cumulation_ver1_kernel( // Declaration only; weighted cumulation, variant 1 (definition not part of this header).
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor query_weight,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor key_weight,
+  at::Tensor value,
+  int hashtable_capacity,  // presumably the number of buckets per hash function - confirm against the definition
+  bool use_cuda            // NOTE(review): presumably gates the CUDA path - confirm
+);
+
+at::Tensor lsh_weighted_cumulation_ver2_kernel( // Declaration only; weighted cumulation, variant 2 (definition not part of this header).
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor query_weight,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor key_weight,
+  at::Tensor value,
+  int hashtable_capacity,  // presumably the number of buckets per hash function - confirm against the definition
+  bool use_cuda            // NOTE(review): presumably gates the CUDA path - confirm
+);
+
+at::Tensor lsh_weighted_cumulation_ver3_kernel( // Declaration only; weighted cumulation, variant 3 (definition not part of this header).
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor query_weight,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor key_weight,
+  at::Tensor value,
+  int hashtable_capacity,  // presumably the number of buckets per hash function - confirm against the definition
+  bool use_cuda            // NOTE(review): presumably gates the CUDA path - confirm
+);
+
+at::Tensor lsh_weighted_cumulation_ver4_kernel( // Declaration only; weighted cumulation, variant 4 (definition not part of this header).
+  at::Tensor query_mask,
+  at::Tensor query_hash_code,
+  at::Tensor query_weight,
+  at::Tensor key_mask,
+  at::Tensor key_hash_code,
+  at::Tensor key_weight,
+  at::Tensor value,
+  int hashtable_capacity,  // presumably the number of buckets per hash function - confirm against the definition
+  bool use_cuda            // NOTE(review): presumably gates the CUDA path - confirm
+);
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..22944e97044659f896451936c6253d5aadd7a769
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu
@@ -0,0 +1,825 @@
+// File from https://github.com/mlpen/YOSO/blob/main/encoders/backbones/efficient_attentions/yoso/yoso_v1/cuda/fast_lsh_cumulation_cuda.cu
+
+#include "fast_lsh_cumulation_cuda.h"
+#include "common_cuda_device.h"
+#include "common_cuda.h"
+#include "common.h"
+#include <stdio.h>
+//////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ void fast_hadamard_transform(float *vector_buffer, int vector_dim, int dim_idx) { // In-place, unnormalized butterfly transform of vector_buffer; the calling thread owns element dim_idx. Contains __syncthreads(), so every thread of the block must call it.
+  int stride = vector_dim / 2;
+  while (stride > (WARP_SIZE / 2)) { // large strides: partners are exchanged through vector_buffer
+    __syncthreads(); // earlier writes to vector_buffer must be visible before the partner is read
+    int sign = 1 - ((dim_idx / stride) % 2) * 2; // +1 when dim_idx / stride is even, -1 when it is odd
+    float val1 = vector_buffer[dim_idx];
+    float val2 = vector_buffer[dim_idx + sign * stride]; // partner sits stride above (sign +1) or stride below (sign -1)
+    __syncthreads(); // all reads finish before any thread overwrites its element
+    vector_buffer[dim_idx] = float(sign) * val1 + val2;
+    stride = stride / 2;
+  }
+  // Remaining strides (WARP_SIZE / 2 down to 1) use shuffles instead; assumes dim_idx % WARP_SIZE is the lane id - TODO confirm.
+  float val = vector_buffer[dim_idx];
+  #pragma unroll
+  for (stride = (WARP_SIZE / 2); stride > 0; stride = stride / 2) {
+    int sign = 1 - ((dim_idx / stride) % 2) * 2;
+    val = float(sign) * val + __shfl_xor_sync(FULL_MASK, val, stride); // same butterfly, partner value fetched from lane ^ stride
+  }
+  vector_buffer[dim_idx] = val; // no barrier follows: callers that read other threads' elements must synchronize first
+}
+
+__global__ void fast_hash_ver1_cuda_kernel( // Computes sign-bit hash codes of each unmasked vector after three rounds of (elementwise Dmat multiply, Hadamard transform).
+  int *mask,        // [batch_size, num_vector]
+  float *vector,    // [batch_size, num_vector, vector_dim]
+  int *Dmat,        // [batch_size, 3, num_part, vector_dim]
+  int *hash_code,   // [batch_size, num_vector, num_hash_f]
+  int batch_size,
+  int num_vector,
+  int vector_dim,
+  int num_part,
+  int num_hash_f,
+  int hash_code_len
+) {
+  // Grid: x = part, y = vector, z = batch. threadIdx.x is the element index; assumes blockDim.x == vector_dim - TODO confirm at the launch site.
+  int batch_idx = blockIdx.z;
+  int vector_idx = blockIdx.y;
+  int part_idx = blockIdx.x;
+
+  int dim_idx = threadIdx.x;
+
+  int batch_idx__vector_idx = batch_idx * num_vector + vector_idx;
+  if (mask[batch_idx__vector_idx] == 0) { // depends only on block indices, so the whole block exits together; hash_code is left untouched for masked vectors
+    return;
+  }
+
+  extern __shared__ float buffer[];
+  float *vector_buffer = buffer; // dynamic shared memory; element dim_idx is accessed, so at least blockDim.x floats are needed
+
+  vector_buffer[dim_idx] = vector[batch_idx__vector_idx * vector_dim + dim_idx];
+  // Three rounds: scale by this part's Dmat row (int cast to float), then transform in place.
+  vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 0) * num_part + part_idx) * vector_dim + dim_idx];
+  fast_hadamard_transform(vector_buffer, vector_dim, dim_idx);
+  vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 1) * num_part + part_idx) * vector_dim + dim_idx];
+  fast_hadamard_transform(vector_buffer, vector_dim, dim_idx);
+  vector_buffer[dim_idx] = vector_buffer[dim_idx] * (float)Dmat[((batch_idx * 3 + 2) * num_part + part_idx) * vector_dim + dim_idx];
+  fast_hadamard_transform(vector_buffer, vector_dim, dim_idx);
+  // Each run of hash_code_len consecutive elements forms one code: bit (dim_idx % hash_code_len) is set when the element is positive.
+  int num_hash_per_part = vector_dim / hash_code_len;
+  if (hash_code_len == 8 || hash_code_len == 16) { // groups fit in a warp: sum the bits with xor-shuffles
+    int code = select(vector_buffer[dim_idx] > 0, 1 << (dim_idx % hash_code_len), 0);
+    for (int offset = 1; offset < hash_code_len; offset = offset * 2) {
+      code += __shfl_xor_sync(FULL_MASK, code, offset);
+    }
+    if (dim_idx % hash_code_len == 0) { // first thread of each group writes the code
+      int hash_f_idx = part_idx * num_hash_per_part + dim_idx / hash_code_len;
+      if (hash_f_idx < num_hash_f) { // parts may provide more codes than requested; extras are dropped
+        hash_code[batch_idx__vector_idx * num_hash_f + hash_f_idx] = code;
+      }
+    }
+  } else { // other lengths: stage the bit values in the buffer and let one thread per code add them up
+    vector_buffer[dim_idx] = select(vector_buffer[dim_idx] > 0, 1 << (dim_idx % hash_code_len), 0); // stored as float; NOTE(review): exact only while the bit values fit a float mantissa
+    __syncthreads();
+    if (dim_idx < num_hash_per_part) {
+      int code = 0;
+      for (int i = 0; i < hash_code_len; i++) {
+        code += vector_buffer[dim_idx * hash_code_len + i];
+      }
+      int hash_f_idx = part_idx * num_hash_per_part + dim_idx;
+      if (hash_f_idx < num_hash_f) {
+        hash_code[batch_idx__vector_idx * num_hash_f + hash_f_idx] = code;
+      }
+    }
+  }
+}
+
+__global__ void lsh_cumulation_ver1_step1_cuda_kernel( // Scatter step: for every hash function, atomically adds each unmasked key's value slice into the bucket selected by the key's hash code.
+  int *key_mask,           // [batch_size, num_key]
+  int *key_hash_code,      // [batch_size, num_key, num_hash_f]
+  float *value,            // [batch_size, num_key, value_dim]
+  float *hashtable_value,  // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key,
+  int value_dim,
+  int offset_warp          // first value column of this launch; thread x handles column offset_warp + x
+) {
+  // threadIdx.x is used as the lane; threadIdx.y selects the key inside the block; blockIdx.y is the batch.
+  int warp_thread_idx = threadIdx.x;
+
+  int batch_idx = blockIdx.y;
+  int key_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  int batch_idx__key_idx = batch_idx * num_key + key_idx;
+  if (key_mask[batch_idx__key_idx] == 0) { // masked keys contribute nothing
+    return;
+  }
+
+  if (num_hash_f > WARP_SIZE) { // hash codes are loaded WARP_SIZE at a time; NOTE(review): no tail guard, so num_hash_f is presumably a multiple of WARP_SIZE here
+    float warp_value = value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+    for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+      int warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_start + warp_thread_idx];
+      #pragma unroll
+      for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+        int current_hashcode = warp_hashcode;
+        current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); // broadcast lane hash_f_offset's code to the whole warp
+        int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+        atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value);
+      }
+    }
+  } else { // all hash codes fit in one warp: lanes below num_hash_f hold one code each, the rest hold 0 and are never a shuffle source
+    float warp_value = value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+    int warp_hashcode = 0;
+    if (warp_thread_idx < num_hash_f) {
+      warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + warp_thread_idx];
+    }
+    for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+      int current_hashcode = warp_hashcode;
+      current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); // broadcast the code of hash function hash_f_idx
+      int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+      atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value); // atomic: several keys may share a bucket
+    }
+  }
+
+}
+
+__global__ void lsh_cumulation_ver1_step2_cuda_kernel( // Gather step: for each unmasked query, sums the bucket contents selected by its hash codes and writes the mean over hash functions.
+  int *query_mask,         // [batch_size, num_query]
+  int *query_hash_code,    // [batch_size, num_query, num_hash_f]
+  float *hashtable_value,  // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+  float *cumulation_value, // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_query,
+  int value_dim,
+  int offset_warp          // first output column of this launch; thread x handles column offset_warp + x
+) {
+  // threadIdx.x is used as the lane; threadIdx.y selects the query inside the block; blockIdx.y is the batch.
+  int warp_thread_idx = threadIdx.x;
+
+  int batch_idx = blockIdx.y;
+  int query_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  int batch_idx__query_idx = batch_idx * num_query + query_idx;
+  if (query_mask[batch_idx__query_idx] == 0) { // masked queries: their output slice is not written
+    return;
+  }
+
+  if (num_hash_f > WARP_SIZE) { // hash codes are loaded WARP_SIZE at a time; NOTE(review): no tail guard, so num_hash_f is presumably a multiple of WARP_SIZE here
+    float warp_value = 0;
+    for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+      int warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_start + warp_thread_idx];
+      #pragma unroll
+      for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+        int current_hashcode = warp_hashcode;
+        current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); // broadcast lane hash_f_offset's code to the whole warp
+        int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+        warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+      }
+    }
+    cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] = warp_value / float(num_hash_f); // plain store: overwrites the slice
+  } else { // all hash codes fit in one warp: lanes below num_hash_f hold one code each
+    float warp_value = 0;
+    int warp_hashcode = 0;
+    if (warp_thread_idx < num_hash_f) {
+      warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + warp_thread_idx];
+    }
+    for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+      int current_hashcode = warp_hashcode;
+      current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); // broadcast the code of hash function hash_f_idx
+      int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+      warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+    }
+    cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] = warp_value / float(num_hash_f); // plain store: overwrites the slice
+  }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver1_step1_cuda_kernel( // Scatter step of the weighted variant: like the unweighted scatter, but the value slice is first scaled by key_weight[..., weight_idx].
+  int *key_mask,            // [batch_size, num_key]
+  int *key_hash_code,       // [batch_size, num_key, num_hash_f]
+  float *key_weight,        // [batch_size, num_key, weight_dim]
+  float *value,             // [batch_size, num_key, value_dim]
+  float *hashtable_value,   // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key,
+  int value_dim,
+  int weight_dim,
+  int offset_warp,          // first value column of this launch; thread x handles column offset_warp + x
+  int weight_idx            // the single weight column used by this launch
+) {
+  // threadIdx.x is used as the lane; threadIdx.y selects the key inside the block; blockIdx.y is the batch.
+  int warp_thread_idx = threadIdx.x;
+
+  int batch_idx = blockIdx.y;
+  int key_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  int batch_idx__key_idx = batch_idx * num_key + key_idx;
+  if (key_mask[batch_idx__key_idx] == 0) { // masked keys contribute nothing
+    return;
+  }
+
+  if (num_hash_f > WARP_SIZE) { // hash codes are loaded WARP_SIZE at a time; NOTE(review): no tail guard, so num_hash_f is presumably a multiple of WARP_SIZE here
+    float warp_value = key_weight[batch_idx__key_idx * weight_dim + weight_idx] * value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+    for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+      int warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + hash_f_start + warp_thread_idx];
+      #pragma unroll
+      for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+        int current_hashcode = warp_hashcode;
+        current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); // broadcast lane hash_f_offset's code to the whole warp
+        int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+        atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value);
+      }
+    }
+  } else { // all hash codes fit in one warp: lanes below num_hash_f hold one code each, the rest hold 0 and are never a shuffle source
+    float warp_value = key_weight[batch_idx__key_idx * weight_dim + weight_idx] * value[batch_idx__key_idx * value_dim + offset_warp + warp_thread_idx];
+    int warp_hashcode = 0;
+    if (warp_thread_idx < num_hash_f) {
+      warp_hashcode = key_hash_code[batch_idx__key_idx * num_hash_f + warp_thread_idx];
+    }
+    for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+      int current_hashcode = warp_hashcode;
+      current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); // broadcast the code of hash function hash_f_idx
+      int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+      atomicAdd(&hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx], warp_value); // atomic: several keys may share a bucket
+    }
+  }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver1_step2_cuda_kernel( // Gather step of the weighted variant: sums the selected buckets, scales by query_weight[..., weight_idx] / num_hash_f and ADDS the result to the output.
+  int *query_mask,          // [batch_size, num_query]
+  int *query_hash_code,     // [batch_size, num_query, num_hash_f]
+  float *query_weight,      // [batch_size, num_query, weight_dim]
+  float *hashtable_value,   // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+  float *cumulation_value,  // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_query,
+  int value_dim,
+  int weight_dim,
+  int offset_warp,          // first output column of this launch; thread x handles column offset_warp + x
+  int weight_idx            // the single weight column used by this launch
+) {
+  // threadIdx.x is used as the lane; threadIdx.y selects the query inside the block; blockIdx.y is the batch.
+  int warp_thread_idx = threadIdx.x;
+
+  int batch_idx = blockIdx.y;
+  int query_idx = blockIdx.x * blockDim.y + threadIdx.y;
+
+  int batch_idx__query_idx = batch_idx * num_query + query_idx;
+  if (query_mask[batch_idx__query_idx] == 0) { // masked queries: their output slice is not touched
+    return;
+  }
+
+  if (num_hash_f > WARP_SIZE) { // hash codes are loaded WARP_SIZE at a time; NOTE(review): no tail guard, so num_hash_f is presumably a multiple of WARP_SIZE here
+    float warp_value = 0;
+    for (int hash_f_start = 0; hash_f_start < num_hash_f; hash_f_start = hash_f_start + WARP_SIZE) {
+      int warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + hash_f_start + warp_thread_idx];
+      #pragma unroll
+      for (int hash_f_offset = 0; hash_f_offset < WARP_SIZE; hash_f_offset++) {
+        int current_hashcode = warp_hashcode;
+        current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_offset); // broadcast lane hash_f_offset's code to the whole warp
+        int hashtable_idx = (batch_idx * num_hash_f + (hash_f_start + hash_f_offset)) * hashtable_capacity + current_hashcode;
+        warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+      }
+    }
+    float warp_weight = query_weight[batch_idx__query_idx * weight_dim + weight_idx];
+    cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] += warp_weight * warp_value / float(num_hash_f); // += accumulates; presumably the host launches once per weight_idx to sum over weight columns - confirm
+  } else { // all hash codes fit in one warp: lanes below num_hash_f hold one code each
+    float warp_value = 0;
+    int warp_hashcode = 0;
+    if (warp_thread_idx < num_hash_f) {
+      warp_hashcode = query_hash_code[batch_idx__query_idx * num_hash_f + warp_thread_idx];
+    }
+    for (int hash_f_idx = 0; hash_f_idx < num_hash_f; hash_f_idx++) {
+      int current_hashcode = warp_hashcode;
+      current_hashcode = __shfl_sync(FULL_MASK, current_hashcode, hash_f_idx); // broadcast the code of hash function hash_f_idx
+      int hashtable_idx = (batch_idx * num_hash_f + hash_f_idx) * hashtable_capacity + current_hashcode;
+      warp_value = warp_value + hashtable_value[hashtable_idx * WARP_SIZE + warp_thread_idx];
+    }
+    float warp_weight = query_weight[batch_idx__query_idx * weight_dim + weight_idx];
+    cumulation_value[batch_idx__query_idx * value_dim + offset_warp + warp_thread_idx] += warp_weight * warp_value / float(num_hash_f); // += accumulates onto the existing output
+  }
+
+}
+
+__global__ void count_sort_step1_cuda_kernel(  // counting-sort pass 1: per-bucket histogram of hash codes
+  int *key_mask,         // [batch_size, num_key]
+  int *key_hash_code,    // [batch_size, num_key, num_hash_f]
+  int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key
+) {
+  // One thread per (row, hash function): x indexes the hash function, y the row within the block, blockIdx.y the batch.
+  const int fn = threadIdx.x;
+  const int batch = blockIdx.y;
+  const int row = batch * num_key + (blockIdx.x * blockDim.y + threadIdx.y);
+
+  // Only unmasked rows are counted.
+  if (key_mask[row] != 0) {
+    const int bucket = key_hash_code[row * num_hash_f + fn];
+    const int slot = (batch * num_hash_f + fn) * hashtable_capacity + bucket;
+    // Several rows may hit the same bucket concurrently, hence the atomic increment.
+    atomicAdd(&count_sort_table[slot], 1);
+  }
+
+}
+
+__global__ void count_sort_step2_cuda_kernel(  // counting-sort pass 2: turns per-bucket counts into bucket start offsets (exclusive prefix sum), in place
+  int *count_sort_table,  // [batch_size, num_hash_f, hashtable_capacity]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity
+) {
+  // One block per (hash function = blockIdx.x, batch = blockIdx.y); needs hashtable_capacity * sizeof(int) bytes of dynamic shared memory.
+  int batch_idx = blockIdx.y;
+  int hash_f_idx = blockIdx.x;
+  // NOTE(review): the scan below has no tail guard, so hashtable_capacity is presumably a multiple of blockDim.x - confirm at the launch site.
+  int num_threads = blockDim.x;
+  int thread_id = threadIdx.x;
+
+  int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx;
+
+  extern __shared__ float buffer[];
+  int *table_buffer = (int*)buffer;
+  // Stage the counts shifted right by one slot (slot 0 = 0), so the inclusive scan below yields an exclusive prefix sum.
+  if (thread_id == 0) {
+    table_buffer[0] = 0;
+  }
+  copy_data<int>(&count_sort_table[batch_idx__hash_f_idx * hashtable_capacity], &table_buffer[1], hashtable_capacity - 1, num_threads, thread_id);
+  __syncthreads();  // explicit barrier: the staged data must be visible to all threads whether or not copy_data synchronizes internally
+  for (int table_idx_start = 0; table_idx_start < hashtable_capacity; table_idx_start = table_idx_start + num_threads) {
+    int thread_value = table_buffer[table_idx_start + thread_id];
+    int next_thread_value = 0;
+    for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {  // inclusive scan inside each WARP_SIZE-wide chunk
+      next_thread_value = __shfl_up_sync(FULL_MASK, thread_value, offset);
+      if (thread_id % WARP_SIZE >= offset) {
+        thread_value = thread_value + next_thread_value;
+      }
+    }
+    table_buffer[table_idx_start + thread_id] = thread_value;
+  }
+  __syncthreads();
+  // Carry pass: the first warp walks the chunks in order and adds the running total (last slot of the previous chunk) to each chunk.
+  if (hashtable_capacity > WARP_SIZE && thread_id < WARP_SIZE) {
+    for (int table_idx_start = WARP_SIZE; table_idx_start < hashtable_capacity; table_idx_start = table_idx_start + WARP_SIZE) {
+      table_buffer[table_idx_start + thread_id] += table_buffer[table_idx_start - 1];
+      __syncwarp();  // the next iteration reads the slot the last lane just wrote; do not rely on implicit warp lockstep
+    }
+  }
+  __syncthreads();  // the other warps must not copy out until the first warp has finished the carry pass
+  // Write the finished offsets back over the counts.
+  copy_data<int>(table_buffer, &count_sort_table[batch_idx__hash_f_idx * hashtable_capacity], hashtable_capacity, num_threads, thread_id);
+
+}
+
+
+__global__ void count_sort_step3_cuda_kernel(  // counting-sort pass 3: scatters row indices into bucket order
+  int *key_mask,          // [batch_size, num_key]
+  int *key_hash_code,     // [batch_size, num_key, num_hash_f]
+  int *count_sort_table,  // [batch_size, num_hash_f, hashtable_capacity]
+  int *key_sorted_idxes,  // [batch_size, num_hash_f, num_key]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key
+) {
+  // One thread per (row, hash function): x indexes the hash function, y the row within the block, blockIdx.y the batch.
+  const int fn = threadIdx.x;
+  const int batch = blockIdx.y;
+  const int row_in_batch = blockIdx.x * blockDim.y + threadIdx.y;
+  const int row = batch * num_key + row_in_batch;
+
+  // Masked rows are not placed in the sorted order.
+  if (key_mask[row] != 0) {
+    const int table_row = batch * num_hash_f + fn;
+    const int bucket = key_hash_code[row * num_hash_f + fn];
+    // The old value returned by atomicAdd is this row's slot: each bucket entry acts as a running
+    // write cursor, so concurrent rows of one bucket get distinct slots (in no fixed order).
+    const int slot = atomicAdd(&count_sort_table[table_row * hashtable_capacity + bucket], 1);
+    key_sorted_idxes[table_row * num_key + slot] = row_in_batch;
+  }
+
+}
+
+__global__ void extract_query_info_cuda_kernel(  // looks up, per row and hash function, the (offset, count) of the matching bucket
+  int *query_mask,       // [batch_size, num_query]
+  int *query_hash_code,  // [batch_size, num_query, num_hash_f]
+  int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+  int *query_info,       // [batch_size, num_query, 2, num_hash_f]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_query
+) {
+  // One thread per (row, hash function): x indexes the hash function, y the row within the block, blockIdx.y the batch.
+  const int fn = threadIdx.x;
+  const int batch = blockIdx.y;
+  const int row = batch * num_query + (blockIdx.x * blockDim.y + threadIdx.y);
+
+  // Masked rows leave their query_info entries untouched.
+  if (query_mask[row] != 0) {
+    const int bucket = query_hash_code[row * num_hash_f + fn];
+    const int slot = (batch * num_hash_f + fn) * hashtable_capacity + bucket;
+    // The table is treated as cumulative bucket end positions: a bucket starts where the
+    // previous one ends (0 for the first bucket) and its size is the difference of the two.
+    const int first = select(bucket == 0, 0, count_sort_table[slot - 1]);
+    const int how_many = count_sort_table[slot] - first;
+
+    // Plane 0 of query_info holds the offsets, plane 1 the counts.
+    query_info[row * 2 * num_hash_f + fn] = first;
+    query_info[(row * 2 + 1) * num_hash_f + fn] = how_many;
+  }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver2_step2_cuda_kernel(  // For one (query, hash function, batch): adds dot(query_weight, key_weight) / num_hash_f * value[key] to the query's output for every key in the matching bucket.
+  int *query_mask,         // [batch_size, num_query]
+  int *query_info,         // [batch_size, num_query, 2, num_hash_f]
+  int *key_sorted_idxes,   // [batch_size, num_hash_f, num_key]
+  float *query_weight,     // [batch_size, num_query, weight_dim]
+  float *key_weight,       // [batch_size, num_key, weight_dim]
+  float *value,            // [batch_size, num_key, value_dim]
+  float *cumulation_value, // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int num_query,
+  int num_key,
+  int value_dim,
+  int weight_dim
+) {
+  // Grid: x = query, y = hash function, z = batch. Block: threadIdx.x is the lane, threadIdx.y the warp. Dynamic shared memory: weight_dim floats + WARP_SIZE ints.
+  int batch_idx = blockIdx.z;
+  int hash_f_idx = blockIdx.y;
+  int query_idx = blockIdx.x;
+  // NOTE(review): the weight/value loops step by WARP_SIZE with no tail guard, so weight_dim and value_dim are presumably multiples of WARP_SIZE - confirm.
+  int num_threads = blockDim.y * blockDim.x;
+  int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+  int num_warps = blockDim.y;
+  int warp_idx = threadIdx.y;
+  int warp_thread_idx = threadIdx.x;
+
+  int batch_idx__query_idx = batch_idx * num_query + query_idx;
+  if (query_mask[batch_idx__query_idx] == 0) {
+    return;
+  }
+
+  int key_offset = query_info[batch_idx__query_idx * 2 * num_hash_f + hash_f_idx];
+  int key_count = query_info[(batch_idx__query_idx * 2 + 1) * num_hash_f + hash_f_idx];
+
+  if (key_count == 0) {
+    return;
+  }
+  // Both early returns depend only on block indices, so the whole block exits together and the barriers below are reached uniformly.
+  extern __shared__ float buffer[];
+
+  if (key_count == 1) {  // single key: one warp handles it straight from global memory, no staging needed
+    if (warp_idx == 0) {
+      int key_idx = key_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_key + key_offset];
+      int batch_idx__key_idx = batch_idx * num_key + key_idx;
+      float weight = 0;
+      for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+        int weight_dim_idx = weight_offset + warp_thread_idx;
+        float val = query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx] * key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx];
+        #pragma unroll
+        for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {  // xor-shuffle sum: every lane ends with the full partial dot product
+          val += __shfl_xor_sync(FULL_MASK, val, offset);
+        }
+        weight = weight + val;
+      }
+      weight = weight / float(num_hash_f);
+      for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+        int value_dim_idx = value_offset + warp_thread_idx;
+        float val = value[batch_idx__key_idx * value_dim + value_dim_idx];
+        atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);  // atomic: blocks for other hash functions update the same query row
+      }
+    }
+  } else {  // several keys: stage the query weights once, then process the bucket in chunks of up to WARP_SIZE keys
+    float *weight_buffer = buffer;
+    int *key_idxes_buffer = (int*)&buffer[weight_dim];
+
+    copy_data_nonblocking<float>(&query_weight[batch_idx__query_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id);
+    while (key_count > 0) {
+      int work_size = min(WARP_SIZE, key_count);
+      copy_data_nonblocking<int>(&key_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_key + key_offset], key_idxes_buffer, work_size, num_threads, thread_id);
+      __syncthreads();  // staged weights and key indices must be visible before any warp reads them
+      for (int work_offset = 0; work_offset < WARP_SIZE; work_offset = work_offset + num_warps) {  // each warp takes every num_warps-th key of the chunk
+        int work_idx = work_offset + warp_idx;
+        if (work_idx < key_count) {
+          int key_idx = key_idxes_buffer[work_idx];
+          int batch_idx__key_idx = batch_idx * num_key + key_idx;
+          float weight = 0;
+          for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+            int weight_dim_idx = weight_offset + warp_thread_idx;
+            float val = weight_buffer[weight_dim_idx] * key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx];
+            #pragma unroll
+            for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+              val += __shfl_xor_sync(FULL_MASK, val, offset);
+            }
+            weight = weight + val;
+          }
+          weight = weight / float(num_hash_f);
+          for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+            int value_dim_idx = value_offset + warp_thread_idx;
+            float val = value[batch_idx__key_idx * value_dim + value_dim_idx];
+            atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+          }
+        }
+      }
+      __syncthreads();  // all warps must be done reading key_idxes_buffer before the next chunk overwrites it (key_count is block-uniform, so every thread reaches this)
+      key_count = key_count - work_size;
+      key_offset = key_offset + work_size;
+    }
+  }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver3_step2_cuda_kernel(  // For one (key, hash function, batch): adds dot(key_weight, query_weight) / num_hash_f * value[key] to the output of every query in the matching bucket.
+  int *query_sorted_idxes,   // [batch_size, num_hash_f, num_query]
+  int *key_mask,             // [batch_size, num_key]
+  int *key_info,             // [batch_size, num_key, 2, num_hash_f]
+  float *query_weight,       // [batch_size, num_query, weight_dim]
+  float *key_weight,         // [batch_size, num_key, weight_dim]
+  float *value,              // [batch_size, num_key, value_dim]
+  float *cumulation_value,   // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int num_query,
+  int num_key,
+  int value_dim,
+  int weight_dim
+) {
+  // Grid: x = key, y = hash function, z = batch. Block: threadIdx.x is the lane, threadIdx.y the warp. Dynamic shared memory: weight_dim + value_dim floats + WARP_SIZE ints.
+  int batch_idx = blockIdx.z;
+  int hash_f_idx = blockIdx.y;
+  int key_idx = blockIdx.x;
+  // NOTE(review): the weight/value loops step by WARP_SIZE with no tail guard, so weight_dim and value_dim are presumably multiples of WARP_SIZE - confirm.
+  int num_threads = blockDim.y * blockDim.x;
+  int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+  int num_warps = blockDim.y;
+  int warp_idx = threadIdx.y;
+  int warp_thread_idx = threadIdx.x;
+
+  int batch_idx__key_idx = batch_idx * num_key + key_idx;
+  if (key_mask[batch_idx__key_idx] == 0) {
+    return;
+  }
+
+  int query_offset = key_info[batch_idx__key_idx * 2 * num_hash_f + hash_f_idx];
+  int query_count = key_info[(batch_idx__key_idx * 2 + 1) * num_hash_f + hash_f_idx];
+
+  if (query_count == 0) {
+    return;
+  }
+  // Both early returns depend only on block indices, so the whole block exits together and the barriers below are reached uniformly.
+  extern __shared__ float buffer[];
+
+  if (query_count == 1) {  // single query: one warp handles it straight from global memory, no staging needed
+    if (warp_idx == 0) {
+      int query_idx = query_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_query + query_offset];
+      int batch_idx__query_idx = batch_idx * num_query + query_idx;
+      float weight = 0;
+      for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+        int weight_dim_idx = weight_offset + warp_thread_idx;
+        float val = key_weight[batch_idx__key_idx * weight_dim + weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx];
+        #pragma unroll
+        for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {  // xor-shuffle sum: every lane ends with the full partial dot product
+          val += __shfl_xor_sync(FULL_MASK, val, offset);
+        }
+        weight = weight + val;
+      }
+      weight = weight / float(num_hash_f);
+      for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+        int value_dim_idx = value_offset + warp_thread_idx;
+        float val = value[batch_idx__key_idx * value_dim + value_dim_idx];
+        atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);  // atomic: other blocks may update the same query row
+      }
+    }
+  } else {  // several queries: stage this key's weights and value once, then process the bucket in chunks of up to WARP_SIZE queries
+    float *weight_buffer = buffer;
+    float *value_buffer = &buffer[weight_dim];
+    int *query_idxes_buffer = (int*)&buffer[weight_dim + value_dim];
+
+    copy_data_nonblocking<float>(&key_weight[batch_idx__key_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id);
+    copy_data_nonblocking<float>(&value[batch_idx__key_idx * value_dim], value_buffer, value_dim, num_threads, thread_id);
+    while (query_count > 0) {
+      int work_size = min(WARP_SIZE, query_count);
+      copy_data_nonblocking<int>(&query_sorted_idxes[(batch_idx * num_hash_f + hash_f_idx) * num_query + query_offset], query_idxes_buffer, work_size, num_threads, thread_id);
+      __syncthreads();  // staged weights, value and query indices must be visible before any warp reads them
+      for (int work_offset = 0; work_offset < WARP_SIZE; work_offset = work_offset + num_warps) {  // each warp takes every num_warps-th query of the chunk
+        int work_idx = work_offset + warp_idx;
+        if (work_idx < query_count) {
+          int query_idx = query_idxes_buffer[work_idx];
+          int batch_idx__query_idx = batch_idx * num_query + query_idx;
+          float weight = 0;
+          for (int weight_offset = 0; weight_offset < weight_dim; weight_offset = weight_offset + WARP_SIZE) {
+            int weight_dim_idx = weight_offset + warp_thread_idx;
+            float val = weight_buffer[weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx];
+            #pragma unroll
+            for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+              val += __shfl_xor_sync(FULL_MASK, val, offset);
+            }
+            weight = weight + val;
+          }
+          weight = weight / float(num_hash_f);
+          for (int value_offset = 0; value_offset < value_dim; value_offset = value_offset + WARP_SIZE) {
+            int value_dim_idx = value_offset + warp_thread_idx;
+            float val = value_buffer[value_dim_idx];
+            atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+          }
+        }
+      }
+      __syncthreads();  // all warps must be done reading query_idxes_buffer before the next chunk overwrites it (query_count is block-uniform, so every thread reaches this)
+      query_count = query_count - work_size;
+      query_offset = query_offset + work_size;
+    }
+  }
+
+}
+
+__global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(
+  int *query_sorted_idxes,   // [batch_size, num_hash_f, num_query]
+  int *key_mask,             // [batch_size, num_key]
+  int *key_info,             // [batch_size, num_key, 2, num_hash_f]
+  float *query_weight,       // [batch_size, num_query, weight_dim]
+  float *key_weight,         // [batch_size, num_key, weight_dim]
+  float *value,              // [batch_size, num_key, value_dim]
+  float *cumulation_value,   // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int num_query,
+  int num_key,
+  int value_dim,
+  int weight_dim
+) {
+  // One thread block per (batch, key) pair: for each distinct query found in this key's key_info ranges, atomically adds (occurrences * dot(key_weight, query_weight) / num_hash_f) * value into cumulation_value.
+  int batch_idx = blockIdx.y;
+  int key_idx = blockIdx.x;
+  // threadIdx.x is used as the lane within a warp and threadIdx.y as the warp index. NOTE(review): the FULL_MASK shuffles below presumably require blockDim.x == WARP_SIZE — confirm at the launch site.
+  int num_threads = blockDim.y * blockDim.x;
+  int thread_id = threadIdx.y * blockDim.x + threadIdx.x;
+
+  int num_warps = blockDim.y;
+  int warp_idx = threadIdx.y;
+  int warp_thread_idx = threadIdx.x;
+  // Masked-out keys contribute nothing; the mask is per block, so the whole block exits here before any barrier.
+  int batch_idx__key_idx = batch_idx * num_key + key_idx;
+  if (key_mask[batch_idx__key_idx] == 0) {
+    return;
+  }
+  // Dynamic shared memory layout: [weight_dim floats | value_dim floats | 2 * num_hash_f ints].
+  extern __shared__ float buffer[];
+  float *weight_buffer = buffer;
+  float *value_buffer = &buffer[weight_dim];
+  int *key_info_buffer = (int*)&buffer[weight_dim + value_dim];
+  // Stage this key's weight row, value row and key_info rows; the first __syncthreads() in the loop below precedes every read of these buffers.
+  copy_data_nonblocking<float>(&key_weight[batch_idx__key_idx * weight_dim], weight_buffer, weight_dim, num_threads, thread_id);
+  copy_data_nonblocking<float>(&value[batch_idx__key_idx * value_dim], value_buffer, value_dim, num_threads, thread_id);
+  copy_data_nonblocking<int>(&key_info[batch_idx__key_idx * 2 * num_hash_f], key_info_buffer, 2 * num_hash_f, num_threads, thread_id);
+  // key_info row 0: per-hash start offset into query_sorted_idxes; row 1: remaining query count. Both are updated in place as queries are consumed.
+  int *query_offset_buffer = key_info_buffer;
+  int *query_count_buffer = &key_info_buffer[num_hash_f];
+  // Static shared scratch: a set of query indices, per-slot occurrence counts, a compact list of distinct queries, and the list length.
+  const int hashtable_size = 1024 + OPTIMAL_THREADS_PER_BLOCK;
+  __shared__ int hashtable_query[hashtable_size];
+  __shared__ int hashtable_count[hashtable_size];
+  __shared__ int inserted_query[hashtable_size];
+  __shared__ int query_counter[1];
+
+  int hash_f_idx_base = 0;
+
+  while (true) {
+    // Each round: reset the scratch tables, insert queries until the list is nearly full or everything is consumed, then accumulate.
+    init_buffer_nonblocking<int>(EMPTY_VALUE, hashtable_query, hashtable_size, num_threads, thread_id);
+    init_buffer_nonblocking<int>(0, hashtable_count, hashtable_size, num_threads, thread_id);
+    init_buffer_nonblocking<int>(EMPTY_VALUE, inserted_query, hashtable_size, num_threads, thread_id);
+    init_buffer_nonblocking<int>(0, query_counter, 1, num_threads, thread_id);
+    __syncthreads();
+    // NOTE(review): only hash_f_idx_base is bounds-checked; hash_f_idx_base + warp_idx can reach num_hash_f or beyond unless num_hash_f is a multiple of num_warps — confirm against the launch configuration.
+    while (hash_f_idx_base < num_hash_f) {
+
+      int hash_f_idx = hash_f_idx_base + warp_idx;
+      int batch_idx__hash_f_idx = batch_idx * num_hash_f + hash_f_idx;
+
+      int stop_flag = 0;
+
+      int query_offset = query_offset_buffer[hash_f_idx];
+      int query_count = query_count_buffer[hash_f_idx];
+
+      while (query_count > 0) {
+
+        int work_size = min(query_count, WARP_SIZE);
+
+        // try inserting query to set and check whether the query is new
+        int found_new_query = 0;
+        int query_idx = -1;
+        if (warp_thread_idx < work_size) {
+          query_idx = query_sorted_idxes[batch_idx__hash_f_idx * num_query + query_offset + warp_thread_idx];
+          int slot = set_insert<int>(hashtable_query, hashtable_size, query_idx);
+          if (slot >= 0) {
+            found_new_query = atomicAdd(&hashtable_count[slot], 1) == 0;
+          }
+        }
+
+        // compute cumulative offset (inclusive prefix sum of found_new_query across the warp's lanes)
+        int position_offset = found_new_query;
+        int next_position_offset = 0;
+        #pragma unroll
+        for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+          next_position_offset = __shfl_up_sync(FULL_MASK, position_offset, offset);
+          if (thread_id % WARP_SIZE >= offset) {
+            position_offset = position_offset + next_position_offset;
+          }
+        }
+
+        // get the inserted query list end index: the last lane holds the warp total, reserves that many entries, and broadcasts the base
+        int inserted_query_base = 0;
+        if (thread_id % WARP_SIZE == WARP_SIZE - 1) {
+          inserted_query_base = atomicAdd(query_counter, position_offset);
+        }
+        inserted_query_base = __shfl_sync(FULL_MASK, inserted_query_base, WARP_SIZE - 1);
+
+        // insert new queries to list
+        int insert_idx = inserted_query_base + position_offset - 1;
+        if (found_new_query) {
+          inserted_query[insert_idx] = query_idx;
+        }
+
+        // remove inserted queries from list (shared copies persist across rounds; the locals drive this loop)
+        query_offset_buffer[hash_f_idx] += work_size;
+        query_count_buffer[hash_f_idx] -= work_size;
+        query_offset += work_size;
+        query_count -= work_size;
+
+        // if list is almost full, stop inserting
+        if (inserted_query_base + OPTIMAL_THREADS_PER_BLOCK > hashtable_size) {
+          stop_flag = 1;
+          break;
+        }
+
+      }
+
+      if (stop_flag) {
+        break;
+      }
+
+      hash_f_idx_base = hash_f_idx_base + num_warps;
+
+    }
+
+    __syncthreads();
+    // query_counter[0] is the number of distinct queries collected this round; every thread reads it after the barrier above.
+    int num_distinct_query = query_counter[0];
+
+    if (num_distinct_query > 0) {
+      for (int idx_base = 0; idx_base < num_distinct_query; idx_base = idx_base + num_warps) {
+        int idx = idx_base + warp_idx;
+        if (idx < num_distinct_query) {
+          int query_idx = inserted_query[idx];
+          int batch_idx__query_idx = batch_idx * num_query + query_idx;
+
+          int slot = set_lookup<int>(hashtable_query, hashtable_size, query_idx);
+          int duplicate_count = hashtable_count[slot];
+
+          float weight = 0;
+          for (int weight_idx_base = 0; weight_idx_base < weight_dim; weight_idx_base = weight_idx_base + WARP_SIZE) {
+            int weight_dim_idx = weight_idx_base + warp_thread_idx;
+            float val = weight_buffer[weight_dim_idx] * query_weight[batch_idx__query_idx * weight_dim + weight_dim_idx];
+            #pragma unroll
+            for (int offset = 1; offset < WARP_SIZE; offset = offset << 1) {
+              val += __shfl_xor_sync(FULL_MASK, val, offset);
+            }
+            weight = weight + val;
+          }
+          // scale the dot product by how often this query was inserted this round, then divide by the number of hash functions
+          weight = (float)duplicate_count * weight / float(num_hash_f);
+
+          for (int value_idx_base = 0; value_idx_base < value_dim; value_idx_base = value_idx_base + WARP_SIZE) {
+            int value_dim_idx = value_idx_base + warp_thread_idx;
+            float val = value_buffer[value_dim_idx];
+            atomicAdd(&cumulation_value[batch_idx__query_idx * value_dim + value_dim_idx], weight * val);
+          }
+        }
+      }
+    } else {
+
+      // all computation is completed if num_distinct_query == 0
+      break;
+
+    }
+    // barrier so no warp starts re-initializing the tables while another is still reading them
+    __syncthreads();
+
+  }
+
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2adc0f735358d0fcb6a056e7d19ba745977e129
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_cuda.h
@@ -0,0 +1,157 @@
+__global__ void fast_hash_ver1_cuda_kernel(
+  int *mask,        // [batch_size, num_vector]
+  float *vector,    // [batch_size, num_vector, vector_dim]
+  int *Dmat,        // [3, num_part, vector_dim]
+  int *hash_code,   // [batch_size, num_vector, num_hash_f]
+  int batch_size,
+  int num_vector,
+  int vector_dim,
+  int num_part,
+  int num_hash_f,
+  int hash_code_len
+);
+
+__global__ void lsh_cumulation_ver1_step1_cuda_kernel(
+  int *key_mask,           // [batch_size, num_key]
+  int *key_hash_code,      // [batch_size, num_key, num_hash_f]
+  float *value,            // [batch_size, num_key, value_dim]
+  float *hashtable_value,  // [batch_size, num_hash_f, hashtable_capacity, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key,
+  int value_dim,
+  int offset_warp
+);
+
+__global__ void lsh_cumulation_ver1_step2_cuda_kernel(
+  int *query_mask,         // [batch_size, num_query]
+  int *query_hash_code,    // [batch_size, num_query, num_hash_f]
+  float *hashtable_value,  // [batch_size, num_hash_f, hashtable_capacity, value_dim]
+  float *cumulation_value, // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_query,
+  int value_dim,
+  int offset_warp
+);
+
+__global__ void lsh_weighted_cumulation_ver1_step1_cuda_kernel(
+  int *key_mask,            // [batch_size, num_key]
+  int *key_hash_code,       // [batch_size, num_key, num_hash_f]
+  float *key_weight,        // [batch_size, num_key, weight_dim]
+  float *value,             // [batch_size, num_key, value_dim]
+  float *hashtable_value,   // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key,
+  int value_dim,
+  int weight_dim,
+  int offset_warp,
+  int weight_idx
+);
+
+__global__ void lsh_weighted_cumulation_ver1_step2_cuda_kernel(
+  int *query_mask,          // [batch_size, num_query]
+  int *query_hash_code,     // [batch_size, num_query, num_hash_f]
+  float *query_weight,      // [batch_size, num_query, weight_dim]
+  float *hashtable_value,   // [batch_size, num_hash_f, hashtable_capacity, WARP_SIZE]
+  float *cumulation_value,  // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_query,
+  int value_dim,
+  int weight_dim,
+  int offset_warp,
+  int weight_idx
+);
+
+__global__ void count_sort_step1_cuda_kernel(
+  int *key_mask,         // [batch_size, num_key]
+  int *key_hash_code,    // [batch_size, num_key, num_hash_f]
+  int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key
+);
+
+__global__ void count_sort_step2_cuda_kernel(
+  int *count_sort_table,  // [batch_size, num_hash_f, hashtable_capacity]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity
+);
+
+__global__ void count_sort_step3_cuda_kernel(
+  int *key_mask,          // [batch_size, num_key]
+  int *key_hash_code,     // [batch_size, num_key, num_hash_f]
+  int *count_sort_table,  // [batch_size, num_hash_f, hashtable_capacity]
+  int *key_sorted_idxes,  // [batch_size, num_hash_f, num_key]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_key
+);
+
+__global__ void extract_query_info_cuda_kernel(
+  int *query_mask,       // [batch_size, num_query]
+  int *query_hash_code,  // [batch_size, num_query, num_hash_f]
+  int *count_sort_table, // [batch_size, num_hash_f, hashtable_capacity]
+  int *query_info,       // [batch_size, num_query, 2, num_hash_f]
+  int batch_size,
+  int num_hash_f,
+  int hashtable_capacity,
+  int num_query
+);
+
+__global__ void lsh_weighted_cumulation_ver2_step2_cuda_kernel(
+  int *query_mask,         // [batch_size, num_query]
+  int *query_info,         // [batch_size, num_query, 2, num_hash_f]
+  int *key_sorted_idxes,   // [batch_size, num_hash_f, num_key]
+  float *query_weight,     // [batch_size, num_query, weight_dim]
+  float *key_weight,       // [batch_size, num_key, weight_dim]
+  float *value,            // [batch_size, num_key, value_dim]
+  float *cumulation_value, // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int num_query,
+  int num_key,
+  int value_dim,
+  int weight_dim
+);
+
+__global__ void lsh_weighted_cumulation_ver3_step2_cuda_kernel(
+  int *query_sorted_idxes,   // [batch_size, num_hash_f, num_query]
+  int *key_mask,             // [batch_size, num_key]
+  int *key_info,             // [batch_size, num_key, 2, num_hash_f]
+  float *query_weight,       // [batch_size, num_query, weight_dim]
+  float *key_weight,         // [batch_size, num_key, weight_dim]
+  float *value,              // [batch_size, num_key, value_dim]
+  float *cumulation_value,   // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int num_query,
+  int num_key,
+  int value_dim,
+  int weight_dim
+);
+
+__global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(
+  int *query_sorted_idxes,   // [batch_size, num_hash_f, num_query]
+  int *key_mask,             // [batch_size, num_key]
+  int *key_info,             // [batch_size, num_key, 2, num_hash_f]
+  float *query_weight,       // [batch_size, num_query, weight_dim]
+  float *key_weight,         // [batch_size, num_key, weight_dim]
+  float *value,              // [batch_size, num_key, value_dim]
+  float *cumulation_value,   // [batch_size, num_query, value_dim]
+  int batch_size,
+  int num_hash_f,
+  int num_query,
+  int num_key,
+  int value_dim,
+  int weight_dim
+);
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_torch.cpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_torch.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e150a2be604b28f600ab345a8cc9e97819cca416
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/kernels/yoso/fast_lsh_cumulation_torch.cpp
@@ -0,0 +1,128 @@
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include "fast_lsh_cumulation.h"
+#include "common_cuda.h"
+#include <vector>
+
+std::vector<at::Tensor> fast_hash(
+  at::Tensor query_mask,
+  at::Tensor query_vector,
+  at::Tensor key_mask,
+  at::Tensor key_vector,
+  int num_hash_f,
+  int hash_code_len,
+  bool use_cuda,
+  int version  // accepted but not consulted: only the ver1 implementation is dispatched
+) {  // Thin wrapper: forwards every argument except version to fast_hash_ver1_kernel and returns its result.
+  return fast_hash_ver1_kernel(
+    query_mask,
+    query_vector,
+    key_mask,
+    key_vector,
+    num_hash_f,
+    hash_code_len,
+    use_cuda
+  );
+}
+
+at::Tensor lsh_cumulation(
+  at::Tensor query_mask,         // [batch_size, num_query]
+  at::Tensor query_hash_code,    // [batch_size, num_query, num_hash_f]
+  at::Tensor key_mask,           // [batch_size, num_key]
+  at::Tensor key_hash_code,      // [batch_size, num_key, num_hash_f]
+  at::Tensor value,              // [batch_size, num_key, value_dim]
+  int hashtable_capacity,
+  bool use_cuda,
+  int version                    // accepted but not consulted: only the ver1 implementation is dispatched
+) {  // Thin wrapper: forwards every argument except version to lsh_cumulation_ver1_kernel and returns its result.
+  return lsh_cumulation_ver1_kernel(
+    query_mask,
+    query_hash_code,
+    key_mask,
+    key_hash_code,
+    value,
+    hashtable_capacity,
+    use_cuda
+  );
+}
+
+at::Tensor lsh_weighted_cumulation(
+  at::Tensor query_mask,         // [batch_size, num_query]
+  at::Tensor query_hash_code,    // [batch_size, num_query, num_hash_f]
+  at::Tensor query_weight,       // [batch_size, num_query, weight_dim]
+  at::Tensor key_mask,           // [batch_size, num_key]
+  at::Tensor key_hash_code,      // [batch_size, num_key, num_hash_f]
+  at::Tensor key_weight,         // [batch_size, num_key, weight_dim]
+  at::Tensor value,              // [batch_size, num_key, value_dim]
+  int hashtable_capacity,
+  bool use_cuda,
+  int version                    // selects the implementation: 1, 2, 3 or 4; any other value uses ver3
+) {  // Dispatcher: passes the same argument list (minus version) to the chosen lsh_weighted_cumulation_verN_kernel.
+  if (version == 1) {
+    return lsh_weighted_cumulation_ver1_kernel(
+      query_mask,
+      query_hash_code,
+      query_weight,
+      key_mask,
+      key_hash_code,
+      key_weight,
+      value,
+      hashtable_capacity,
+      use_cuda
+    );
+  } else if (version == 2) {
+    return lsh_weighted_cumulation_ver2_kernel(
+      query_mask,
+      query_hash_code,
+      query_weight,
+      key_mask,
+      key_hash_code,
+      key_weight,
+      value,
+      hashtable_capacity,
+      use_cuda
+    );
+  } else if (version == 3) {
+    return lsh_weighted_cumulation_ver3_kernel(
+      query_mask,
+      query_hash_code,
+      query_weight,
+      key_mask,
+      key_hash_code,
+      key_weight,
+      value,
+      hashtable_capacity,
+      use_cuda
+    );
+  } else if (version == 4) {
+    return lsh_weighted_cumulation_ver4_kernel(
+      query_mask,
+      query_hash_code,
+      query_weight,
+      key_mask,
+      key_hash_code,
+      key_weight,
+      value,
+      hashtable_capacity,
+      use_cuda
+    );
+  } else {  // unrecognized version: fall back to the ver3 implementation (same call as the version == 3 branch)
+    return lsh_weighted_cumulation_ver3_kernel(
+      query_mask,
+      query_hash_code,
+      query_weight,
+      key_mask,
+      key_hash_code,
+      key_weight,
+      value,
+      hashtable_capacity,
+      use_cuda
+    );
+  }
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings: registers the three host entry points defined in this file
+  m.def("fast_hash", &fast_hash, "Fast Hash (CUDA)");
+  m.def("lsh_cumulation", &lsh_cumulation, "LSH Cumulation (CUDA)");
+  m.def("lsh_weighted_cumulation", &lsh_weighted_cumulation, "LSH Weighted Cumulation (CUDA)");
+}
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73b50ac619b49737ba5c95e14df269fce200a3c7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee31bc357bf2c18699de9a156989e6d0f8448161
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..29ddb3093bf39cbd46f8bfdac0cec96d659066da
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b96a9fac5e2e6947334d3ce1fe4340ccc742e619
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c958dad1b9300b27f5b5f67e16fad9041e3db869
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/configuration_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/configuration_bart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee38568d1d9a571684f6287e6ff130232f75d175
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/configuration_bart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..222951d5911c1fde84b3ebf6e517cc22c31a318b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_bart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_flax_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_flax_bart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e17105fc8b2ebf45b3a2a65bf61df362635f77f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_flax_bart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_tf_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_tf_bart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..079f9081369b373d69f2e2a99511bd2110af906f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/modeling_tf_bart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95d9040f2d466a89c87866df5a9740a03471df4f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5adb5739acdecf033bccf0788fdaf6598d97c57
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bart/__pycache__/tokenization_bart_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6c0673b523a50279987dd33aa5c864ba82461bd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/tokenization_bartpho.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/tokenization_bartpho.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44d91bec4925c7b5c67d01af81ff13a31eca51bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bartpho/__pycache__/tokenization_bartpho.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b18d297ebfcca692e60df4bbb71d18e4074bac6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36ccd2340c71300c8c4e21dc8e43222fec3121b6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/tokenization_bert_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/tokenization_bert_japanese.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5a77eabd82e0107495108e2d7f90a8c3424623a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_japanese/__pycache__/tokenization_bert_japanese.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d0710570ddcfa74fb4ba04508b19e2c1f1d19ec
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f2c1e4aa581ab3e7ac9c731df7ba80b20d203ad
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe9dc464abea80063f5bdc3ccac46de2c1be744c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/configuration_big_bird.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/configuration_big_bird.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0967de343730870230a3c5394481706fd387bce
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/configuration_big_bird.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f414bbc30fbee4d561ce4c0d408bf8451ae1c4b6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c242002eb9ff6f06db8451cff6fc7caeb3ad8f63
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__pycache__/tokenization_big_bird_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d0640b0a2fd331c69de4f802f076af4a1bccc6d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/configuration_bigbird_pegasus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/configuration_bigbird_pegasus.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f049b8909d9149b85b7ef778ff69818a128965b3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bigbird_pegasus/__pycache__/configuration_bigbird_pegasus.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd2a9b0d939082c8c9831734a47115d23686386a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ce1330306da65e924f63f6dbad8477628609a3e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/configuration_blip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/configuration_blip_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f90b8cd66599a2fb03e11f19065b229ef9ddce10
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/configuration_blip_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/processing_blip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/processing_blip_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0473f22e8c5b04cbf74dc8e03671b9c7b99550c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip_2/__pycache__/processing_blip_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ca84a320fdc4aa1f4cf99aef78154849514c8a6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_bridgetower import *
+    from .image_processing_bridgetower import *
+    from .image_processing_bridgetower_fast import *
+    from .modeling_bridgetower import *
+    from .processing_bridgetower import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/configuration_bridgetower.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/configuration_bridgetower.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c84b0a294dafb35c991654c6e7df79c3fe58452
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/configuration_bridgetower.py
@@ -0,0 +1,308 @@
+# coding=utf-8
+# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BridgeTower model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BridgeTowerVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the vision configuration of a [`BridgeTowerModel`]. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the bridgetower-base
+    [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in visual encoder model.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        image_size (`int`, *optional*, defaults to 288):
+            The size (resolution) of each image.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        stop_gradient (`bool`, *optional*, defaults to `False`):
+            Whether to stop gradient for training.
+        share_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether LayerNorm layers are shared.
+        remove_last_layer (`bool`, *optional*, defaults to `False`):
+            Whether to remove the last layer from the vision encoder.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import BridgeTowerVisionConfig
+
+    >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the vision model
+    >>> configuration = BridgeTowerVisionConfig()
+
+    >>> # Accessing the configuration
+    >>> configuration
+    ```"""
+
+    model_type = "bridgetower_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_channels=3,
+        patch_size=16,
+        image_size=288,
+        initializer_factor=1,
+        layer_norm_eps=1e-05,
+        stop_gradient=False,
+        share_layernorm=True,
+        remove_last_layer=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_factor = initializer_factor
+        self.layer_norm_eps = layer_norm_eps
+        self.stop_gradient = stop_gradient
+        self.share_layernorm = share_layernorm
+        self.remove_last_layer = remove_last_layer
+
+
+class BridgeTowerTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the text configuration of a [`BridgeTowerModel`]. The default values here
+    are copied from RoBERTa. Instantiating a configuration with the defaults will yield a similar configuration to that
+    of the bridgetower-base [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the text part of the model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`BridgeTowerModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 514):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 1):
+            The vocabulary size of the `token_type_ids`.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+
+    Example:
+
+    ```python
+    >>> from transformers import BridgeTowerTextConfig
+
+    >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the text model
+    >>> configuration = BridgeTowerTextConfig()
+
+    >>> # Accessing the configuration
+    >>> configuration
+    ```"""
+
+    model_type = "bridgetower_text_model"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        initializer_factor=1,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=514,
+        type_vocab_size=1,
+        layer_norm_eps=1e-05,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_factor = initializer_factor
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+
+class BridgeTowerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BridgeTowerModel`]. It is used to instantiate a
+    BridgeTower model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the bridgetower-base
+    [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`):
+            Whether cross modal transformer layers are shared.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        share_link_tower_layers (`bool`, *optional*, defaults to `False`):
+            Whether the bridge/link tower layers are shared.
+        link_tower_type (`str`, *optional*, defaults to `"add"`):
+            Type of the bridge/link layer.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie input and output embeddings.
+        init_layernorm_from_vision_encoder (`bool`, *optional*, defaults to `False`):
+            Whether to init LayerNorm from the vision encoder.
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`BridgeTowerTextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`BridgeTowerVisionConfig`].
+
+    Example:
+
+    ```python
+    >>> from transformers import BridgeTowerModel, BridgeTowerConfig
+
+    >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration
+    >>> configuration = BridgeTowerConfig()
+
+    >>> # Initializing a model from the BridgeTower/bridgetower-base style configuration
+    >>> model = BridgeTowerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "bridgetower"
+    sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig}
+
+    def __init__(
+        self,
+        share_cross_modal_transformer_layers=True,
+        hidden_act="gelu",
+        hidden_size=768,
+        initializer_factor=1,
+        layer_norm_eps=1e-05,
+        share_link_tower_layers=False,
+        link_tower_type="add",
+        num_attention_heads=12,
+        num_hidden_layers=6,
+        tie_word_embeddings=False,
+        init_layernorm_from_vision_encoder=False,
+        text_config=None,
+        vision_config=None,
+        **kwargs,
+    ):
+        # TODO: remove this once the Hub files are updated.
+        _ = kwargs.pop("text_config_dict", None)
+        _ = kwargs.pop("vision_config_dict", None)
+
+        super().__init__(**kwargs)
+        self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers
+        self.hidden_act = hidden_act
+        self.hidden_size = hidden_size
+        self.initializer_factor = initializer_factor
+        self.layer_norm_eps = layer_norm_eps
+        self.share_link_tower_layers = share_link_tower_layers
+        self.link_tower_type = link_tower_type
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.tie_word_embeddings = tie_word_embeddings
+        self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `BridgeTowerTextConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `BridgeTowerVisionConfig` with default values.")
+
+        self.text_config = BridgeTowerTextConfig(**text_config)
+        self.vision_config = BridgeTowerVisionConfig(**vision_config)
+
+
+__all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb39ed0975610f65f2b44725ffbbbb43a0c18f3f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower.py
@@ -0,0 +1,541 @@
+# coding=utf-8
+# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for BridgeTower."""
+
+from collections.abc import Iterable
+from typing import Any, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import PaddingMode, center_crop, pad, resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+if is_vision_available():
+    import PIL
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Return the maximum value across all indices of an iterable of values.
+    """
+    return [max(values_i) for values_i in zip(*values)]
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.make_pixel_mask
+def make_pixel_mask(
+    image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for.
+        output_size (`tuple[int, int]`):
+            Output size of the mask.
+    """
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+    mask = np.zeros(output_size, dtype=np.int64)
+    mask[:input_height, :input_width] = 1
+    return mask
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.get_max_height_width
+def get_max_height_width(
+    images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> list[int]:
+    """
+    Get the maximum height and width across all images in a batch.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images[0])
+
+    if input_data_format == ChannelDimension.FIRST:
+        _, max_height, max_width = max_across_indices([img.shape for img in images])
+    elif input_data_format == ChannelDimension.LAST:
+        max_height, max_width, _ = max_across_indices([img.shape for img in images])
+    else:
+        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+    return (max_height, max_width)
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.get_resize_output_image_size
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    shorter: int = 800,
+    longer: int = 1333,
+    size_divisor: int = 32,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    input_height, input_width = get_image_size(input_image, input_data_format)
+    min_size, max_size = shorter, longer
+
+    scale = min_size / min(input_height, input_width)
+
+    if input_height < input_width:
+        new_height = min_size
+        new_width = scale * input_width
+    else:
+        new_height = scale * input_height
+        new_width = min_size
+
+    if max(new_height, new_width) > max_size:
+        scale = max_size / max(new_height, new_width)
+        new_height = scale * new_height
+        new_width = scale * new_width
+
+    new_height, new_width = int(new_height + 0.5), int(new_width + 0.5)
+    new_height = new_height // size_divisor * size_divisor
+    new_width = new_width // size_divisor * size_divisor
+
+    return new_height, new_width
+
+
+class BridgeTowerImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a BridgeTower image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{'shortest_edge': 288}`):
+            Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under
+            `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if
+            `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method.
+        size_divisor (`int`, *optional*, defaults to 32):
+            The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize`
+            is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Falls back to `OPENAI_CLIP_MEAN` when left as `None`. Can be overridden by the
+            `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Falls back to `OPENAI_CLIP_STD` when left as `None`. Can be overridden
+            by the `image_std` parameter in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image. Can be overridden by the `do_center_crop` parameter in the `preprocess`
+            method.
+        crop_size (`dict[str, int]`, *optional*):
+            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`.
+            Can be overridden by the `crop_size` parameter in the `preprocess` method. If unset defaults to `size`.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by
+            the `do_pad` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        size_divisor: int = 32,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_pad: bool = True,
+        **kwargs,
+    ) -> None:
+        if "pad_and_return_pixel_mask" in kwargs:
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 288}
+        size = get_size_dict(size, default_to_square=False)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.size_divisor = size_divisor
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_pad = do_pad
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        size_divisor: int = 32,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image.
+
+        Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the
+        longer side is larger than the max size `int(1333 / 800 * size["shortest_edge"])`, the longer side is then
+        resized to the max size while preserving the aspect ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
+            size_divisor (`int`, *optional*, defaults to 32):
+                The image is resized to a size that is a multiple of this value.
+            resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size = get_size_dict(size, default_to_square=False)
+        if "shortest_edge" not in size:
+            raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}")
+        shorter = size["shortest_edge"]
+        longer = int(1333 / 800 * shorter)
+        output_size = get_resize_output_image_size(
+            image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def center_crop(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Center crop an image to `(size["shortest_edge"], size["shortest_edge"])`. If the input size is smaller than
+        `crop_size` along any edge, the image is padded with 0's and then center cropped.
+
+        Args:
+            image (`np.ndarray`):
+                Image to center crop.
+            size (`dict[str, int]`):
+                Size of the square output image in the form `{"shortest_edge": int}`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred from the input
+                image.
+        """
+        output_size = size["shortest_edge"]
+        return center_crop(
+            image,
+            size=(output_size, output_size),
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: tuple[int, int],
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pad an image with zeros to the given size.
+        """
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+        output_height, output_width = output_size
+
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        padding = ((0, pad_bottom), (0, pad_right))
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        return padded_image
+
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad
+    def pad(
+        self,
+        images: list[np.ndarray],
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Pads every image at the bottom and right with `constant_values` to the largest height and width in the batch and
+        returns a `BatchFeature` with `pixel_values` and, if requested, the corresponding `pixel_mask`.
+
+        Args:
+            images (`list[np.ndarray]`):
+                Images to pad.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding. Defaults to 0.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+
+        padded_images = [
+            self._pad_image(
+                image,
+                pad_size,
+                constant_values=constant_values,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+            for image in images
+        ]
+        data = {"pixel_values": padded_images}
+
+        if return_pixel_mask:
+            masks = [
+                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
+                for image in images
+            ]
+            data["pixel_mask"] = masks
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        size_divisor: Optional[int] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images into a `BatchFeature` (`pixel_values`, plus `pixel_mask` if `do_pad`).
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Controls the size of the image after `resize`. The shortest edge of the image is resized to
+                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
+                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
+                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
+            size_divisor (`int`, *optional*, defaults to `self.size_divisor`):
+                The image is resized to a size that is a multiple of this value.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to normalize the image by if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also
+                created and returned.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the
+                image is padded with 0's and then center cropped.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be
+                padded with zeros and then cropped
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size_divisor = size_divisor if size_divisor is not None else self.size_divisor
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        # For backwards compatibility. Initial version of this processor was cropping to the "size" argument, which
+        # it should default to if crop_size is undefined.
+        crop_size = (
+            crop_size if crop_size is not None else (self.crop_size if self.crop_size is not None else self.size)
+        )
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        # Here, crop_size is used only if it is set, else size will be used.
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if do_resize:
+            images = [
+                self.resize(
+                    image=image,
+                    size=size,
+                    size_divisor=size_divisor,
+                    resample=resample,
+                    input_data_format=input_data_format,
+                )
+                for image in images
+            ]
+
+        if do_center_crop:
+            images = [
+                self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        if do_pad:
+            encoded_outputs = self.pad(
+                images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=data_format
+            )
+        else:
+            encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+        return encoded_outputs
+
+
+__all__ = ["BridgeTowerImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..5be6f9f6c54b7bf6e973b9102179b63cbfe353d8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/image_processing_bridgetower_fast.py
@@ -0,0 +1,280 @@
+# coding=utf-8
+# Copyright 2025 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for BridgeTower."""
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    BatchFeature,
+    DefaultFastImageProcessorKwargs,
+    ImageInput,
+    SizeDict,
+    TensorType,
+    Unpack,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
+from ...utils import auto_docstring
+
+
+def make_pixel_mask(
+    image: "torch.Tensor",
+    output_size: tuple[int, int],
+) -> "torch.Tensor":
+    """
+    Make a `(batch_size, *output_size)` pixel mask for a batch of images: 1 marks a valid pixel, 0 marks padding.
+
+    Args:
+        image (`torch.Tensor`):
+            Batched images; dim 0 is the batch and the last two dims are height and width.
+        output_size (`tuple[int, int]`):
+            Output `(height, width)` of the mask.
+    """
+    input_height, input_width = image.shape[-2:]
+    batch_size = image.size(0)
+    mask = torch.zeros((batch_size, *output_size), dtype=torch.long)
+    mask[:, :input_height, :input_width] = 1  # keep the batch axis whole; mark only the valid top-left region
+    return mask
+
+
+def get_resize_output_image_size(
+    input_image: "torch.Tensor",
+    shorter: int = 800,
+    longer: int = 1333,
+    size_divisor: int = 32,
+) -> tuple[int, int]:
+    """
+    Compute the `(height, width)` to resize `input_image` to.
+
+    The shorter side is scaled to `shorter` with the aspect ratio kept; if the longer side then exceeds `longer`,
+    both sides are scaled down so it equals `longer`. Each side is rounded half-up and then floored to a multiple
+    of `size_divisor`.
+    """
+    height, width = input_image.shape[-2:]
+    ratio = shorter / min(height, width)
+    # The shorter side is set exactly (no float round trip); only the other side is multiplied by the ratio.
+    target_h, target_w = (shorter, ratio * width) if height < width else (ratio * height, shorter)
+
+    longest = max(target_h, target_w)
+    if longest > longer:
+        ratio = longer / longest
+        target_h, target_w = ratio * target_h, ratio * target_w
+
+    def snap(value):
+        # Round half-up to an int, then floor to the nearest multiple of `size_divisor`.
+        return int(value + 0.5) // size_divisor * size_divisor
+
+    return snap(target_h), snap(target_w)
+
+
+class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    Args:
+        size_divisor (`int`, *optional*, defaults to 32):
+            The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize`
+            is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method.
+    """
+    # The only key declared here; every other key is inherited from `DefaultFastImageProcessorKwargs`.
+    size_divisor: Optional[int]
+
+
+@auto_docstring
+class BridgeTowerImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"shortest_edge": 288}
+    default_to_square = False
+    crop_size = {"shortest_edge": 288}
+    do_resize = True
+    do_center_crop = True
+    do_rescale = True
+    do_normalize = True
+    do_pad = True
+    size_divisor = 32
+    valid_kwargs = BridgeTowerFastImageProcessorKwargs
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        size_divisor: int = 32,
+        interpolation: Optional["F.InterpolationMode"] = None,
+        antialias: bool = True,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Resize an image.
+
+        Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the
+        longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then
+        resized to the max size while preserving the aspect ratio.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Must define `shortest_edge`, the target length of the shorter image side.
+            size_divisor (`int`, *optional*, defaults to 32):
+                The image is resized to a size that is a multiple of this value.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
+            antialias (`bool`, *optional*, defaults to `True`):
+                Antialiasing flag, forwarded unchanged to the parent class `resize`.
+            **kwargs:
+                Accepted for signature compatibility; not used by this method.
+
+        Returns:
+            `torch.Tensor`: The resized image.
+
+        Raises:
+            `ValueError`: If `size.shortest_edge` is unset or 0.
+        """
+        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        if not size.shortest_edge:
+            raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size}")
+        shorter = size.shortest_edge
+        longer = int(1333 / 800 * shorter)
+        output_height, output_width = get_resize_output_image_size(
+            image, shorter=shorter, longer=longer, size_divisor=size_divisor
+        )
+        target_size = SizeDict(height=output_height, width=output_width)
+        return super().resize(image=image, size=target_size, interpolation=interpolation, antialias=antialias)
+
+    def center_crop(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Center crop an image to a square of side `size.shortest_edge`. If the input size is smaller than the crop size
+        along any edge, the image is padded with 0's and then center cropped.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to center crop.
+            size (`SizeDict`):
+                Must define `shortest_edge`; the output is `(shortest_edge, shortest_edge)`.
+            **kwargs:
+                Extra keyword arguments forwarded to torchvision's `center_crop`.
+
+        Returns:
+            `torch.Tensor`: The cropped image.
+        """
+        return F.center_crop(image, output_size=(size.shortest_edge, size.shortest_edge), **kwargs)
+
+    def _pad_image(
+        self,
+        image: "torch.Tensor",
+        output_size: tuple[int, int],
+        constant_values: Union[float, Iterable[float]] = 0,
+    ) -> "torch.Tensor":
+        """
+        Pad the bottom and right of an image with `constant_values` up to the given size.
+
+        `output_size` is `(height, width)`; it is compared with the last two dimensions of `image`, and the difference
+        is added below and to the right of the existing pixels.
+        """
+        input_height, input_width = image.shape[-2:]
+        output_height, output_width = output_size
+
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        # torchvision orders the padding as (left, top, right, bottom), so only the right and bottom edges grow.
+        padding = (0, 0, pad_right, pad_bottom)
+        padded_image = F.pad(image, padding, fill=constant_values)
+        return padded_image
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        size_divisor: Optional[int],
+        interpolation: Optional["F.InterpolationMode"],
+        do_pad: bool,
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """Resize, center-crop, rescale/normalize and optionally pad `images`, returning them as a `BatchFeature`."""
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(
+                    image=stacked_images, size=size, size_divisor=size_divisor, interpolation=interpolation
+                )
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+
+        data = {}
+        if do_pad:
+            processed_images, processed_masks = self.pad(
+                processed_images, return_mask=True, disable_grouping=disable_grouping
+            )
+            processed_masks = torch.stack(processed_masks, dim=0) if return_tensors else processed_masks
+            data["pixel_mask"] = processed_masks
+
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+        data["pixel_values"] = processed_images
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def to_dict(self):
+        encoder_dict = super().to_dict()
+        encoder_dict.pop("_valid_processor_keys", None)
+        encoder_dict.pop("crop_size", None)  # NOTE(review): crop_size is left out of the serialized dict — confirm intent
+        return encoder_dict
+
+
+__all__ = ["BridgeTowerImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db2957f2ef98ff23397c91c9eadb93795eb22a9f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/configuration_bros.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/configuration_bros.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa9e6d5efddbd435936c37948ecd0930426c4131
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/configuration_bros.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/modeling_bros.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/modeling_bros.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6edfae2e587252f3bd2122f985ccd9b9d9d659cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/modeling_bros.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/processing_bros.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/processing_bros.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d9470bfa7a80a1f2ddca41d8b82bb0957b8d462
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bros/__pycache__/processing_bros.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d90f64de97f78ccaf1592c3c5cad40a9b2d5dcb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # the star-imports below are only executed for static type checkers
+    from .configuration_camembert import *
+    from .modeling_camembert import *
+    from .modeling_tf_camembert import *
+    from .tokenization_camembert import *
+    from .tokenization_camembert_fast import *
+else:
+    import sys
+    # At runtime, replace this package's `sys.modules` entry with a `_LazyModule` built from this file's structure.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py
new file mode 100644
index 0000000000000000000000000000000000000000..3979e54874439aa41feb6abe3815df5c7a997419
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/configuration_camembert.py
@@ -0,0 +1,155 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""CamemBERT configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CamembertConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. It is
+    used to instantiate a Camembert model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Camembert
+    [almanach/camembert-base](https://huggingface.co/almanach/camembert-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the CamemBERT model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Example:
+
+    ```python
+    >>> from transformers import CamembertConfig, CamembertModel
+
+    >>> # Initializing a Camembert almanach/camembert-base style configuration
+    >>> configuration = CamembertConfig()
+
+    >>> # Initializing a model (with random weights) from the almanach/camembert-base style configuration
+    >>> model = CamembertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "camembert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,  # pad/bos/eos ids are not stored here; they are passed to PretrainedConfig.__init__ below
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,  # forwarded unchanged to PretrainedConfig.__init__
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class CamembertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:  # input name -> {axis index: dynamic axis name}
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}  # extra "choice" axis for this task only
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(  # both inputs share the same dynamic_axis mapping object
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["CamembertConfig", "CamembertOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a07402f739a603ad3b8a50a197b7784bfc1d2a7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_camembert.py
@@ -0,0 +1,1576 @@
+# coding=utf-8
+# Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CamemBERT model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN, gelu
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_camembert import CamembertConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Camembert
+class CamembertEmbeddings(nn.Module):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+
+    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)  # replaced below
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is a non-persistent buffer (persistent=False): kept out of the state_dict
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+        # End copy
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(  # overrides the copied definition above, adding padding_idx
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]  # drop the trailing embedding dimension
+
+        seq_length = input_shape[1]
+
+        # When token_type_ids is not given, fall back to the all-zeros buffer registered in the constructor. Using the
+        # registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":  # other types add no position embedding at this stage
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(  # ids run from padding_idx + 1 to padding_idx + sequence_length inclusive
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Camembert
+class CamembertSelfAttention(nn.Module):  # manual ("eager") multi-head self-/cross-attention with optional cache
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(  # explicit argument wins over the config
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,  # not read in this method: attention_probs is always returned
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = self.query(hidden_states)
+        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+            1, 2
+        )  # -> (batch, heads, query_len, head_size)
+
+        is_updated = False
+        is_cross_attention = encoder_hidden_states is not None
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states  # source of keys/values
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = self.key(current_states)
+            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+                1, 2
+            )
+            value_layer = self.value(current_states)
+            value_layer = value_layer.view(
+                batch_size, -1, self.num_attention_heads, self.attention_head_size
+            ).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if past_key_values is not None:  # with a cache, the query position is taken as the last key index
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in CamembertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # (batch, heads, len, d) -> (batch, len, heads, d)
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)  # merge the last two dims
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaSdpaSelfAttention with Roberta->Camembert
+class CamembertSdpaSelfAttention(CamembertSelfAttention):  # same projections, attention computed through SDPA
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx)
+        self.dropout_prob = config.attention_probs_dropout_prob  # passed to SDPA as dropout_p while training
+
+    # Adapted from CamembertSelfAttention
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
+            logger.warning_once(
+                "CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
+                "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to "
+                "the manual attention implementation, but specifying the manual implementation will be required from "
+                "Transformers version v5.0.0 onwards. This warning can be removed using the argument "
+                '`attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(  # delegate to the parent class's manual implementation
+                hidden_states,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                past_key_values,
+                output_attentions,
+                cache_position,
+            )
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        query_layer = (
+            self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2)
+        )
+
+        is_updated = False
+        is_cross_attention = encoder_hidden_states is not None
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states  # NOTE: reassigned below
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states  # NOTE: duplicate of above
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = (
+                self.key(current_states)
+                .view(bsz, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+            value_layer = (
+                self.value(current_states)
+                .view(bsz, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
+        # a causal mask in case tgt_len == 1.
+        is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            attn_mask=attention_mask,
+            dropout_p=self.dropout_prob if self.training else 0.0,  # no attention dropout outside training mode
+            is_causal=is_causal,
+        )
+
+        attn_output = attn_output.transpose(1, 2)  # move the head axis next to head_size before merging them
+        attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)
+
+        return attn_output, None  # this path returns no attention probabilities
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->Camembert
+class CamembertSelfOutput(nn.Module):  # dense projection, dropout, then LayerNorm over the sum with input_tensor
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # residual add, then normalize
+        return hidden_states
+
+
+CAMEMBERT_SELF_ATTENTION_CLASSES = {  # looked up with config._attn_implementation in CamembertAttention.__init__
+    "eager": CamembertSelfAttention,
+    "sdpa": CamembertSdpaSelfAttention,
+}
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->Camembert,ROBERTA->CAMEMBERT
+class CamembertAttention(nn.Module):  # pairs the configured self-attention class with CamembertSelfOutput
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        self.self = CAMEMBERT_SELF_ATTENTION_CLASSES[config._attn_implementation](
+            config,
+            position_embedding_type=position_embedding_type,
+            layer_idx=layer_idx,
+        )
+        self.output = CamembertSelfOutput(config)
+        self.pruned_heads = set()  # heads removed so far; extended by prune_heads
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:  # nothing to prune
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)  # presumably dim=1 = input features
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)  # hidden_states is the residual input
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Roberta->Camembert
+class CamembertIntermediate(nn.Module):  # hidden_size -> intermediate_size projection followed by the activation
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # activation given by name: look it up in ACT2FN
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:  # otherwise hidden_act is used directly as the activation callable
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Roberta->Camembert
+class CamembertOutput(nn.Module):  # intermediate_size -> hidden_size projection, dropout, residual LayerNorm
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)  # residual add, then normalize
+        return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->Camembert
+class CamembertLayer(GradientCheckpointingLayer):
+    """One encoder/decoder block: self-attention, optional cross-attention, then a chunked feed-forward."""
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = CamembertAttention(config, layer_idx=layer_idx)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        # Cross-attention is only accepted on decoder configurations.
+        if self.add_cross_attention and not self.is_decoder:
+            raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+        if self.add_cross_attention:
+            self.crossattention = CamembertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx)
+        self.intermediate = CamembertIntermediate(config)
+        self.output = CamembertOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        """Run the block on `hidden_states`.
+
+        Returns a tuple whose first element is the feed-forward output, followed by every
+        element after the first that the self-attention (and, when it runs, the
+        cross-attention) module returned.
+        """
+        self_attn = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attention_output, extras = self_attn[0], self_attn[1:]
+
+        run_cross_attention = self.is_decoder and encoder_hidden_states is not None
+        if run_cross_attention and not hasattr(self, "crossattention"):
+            raise ValueError(
+                f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                " by setting `config.add_cross_attention=True`"
+            )
+        if run_cross_attention:
+            cross_attn = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attention_output = cross_attn[0]
+            # Cross-attention extras are appended after the self-attention ones.
+            extras = extras + cross_attn[1:]
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        return (layer_output,) + extras
+
+    def feed_forward_chunk(self, attention_output):
+        """Feed-forward applied to one chunk: `output(intermediate(x), x)`."""
+        return self.output(self.intermediate(attention_output), attention_output)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->Camembert
+class CamembertEncoder(nn.Module):
+    """Stack of `config.num_hidden_layers` `CamembertLayer` modules applied in sequence."""
+
+    def __init__(self, config, layer_idx=None):
+        # `layer_idx` is accepted for signature compatibility but not used here; each layer
+        # receives its own index from the enumeration below.
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([CamembertLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        """Run every layer over `hidden_states` and collect the requested outputs.
+
+        Returns a `BaseModelOutputWithPastAndCrossAttentions` when `return_dict` is truthy,
+        otherwise a tuple of the non-`None` values among: last hidden state,
+        `past_key_values`, all hidden states, all self-attentions, all cross-attentions.
+        Cross-attentions are only collected when `output_attentions` is set,
+        `config.add_cross_attention` is enabled and `encoder_hidden_states` is provided;
+        otherwise that field is `None`.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        # A layer only runs cross-attention (and so only returns a third element) when
+        # `encoder_hidden_states` is given. Gating on that avoids an IndexError on
+        # `layer_outputs[2]` for models configured with cross-attention but called without
+        # encoder states.
+        collect_cross_attentions = bool(
+            output_attentions and self.config.add_cross_attention and encoder_hidden_states is not None
+        )
+        all_cross_attentions = () if collect_cross_attentions else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # Create a fresh cache when caching is requested on a decoder and none was passed in.
+        if use_cache and self.config.is_decoder and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        # Legacy tuple caches are still accepted but converted (with a one-time warning).
+        if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                # Record the input of each layer; the final output is appended after the loop.
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if collect_cross_attentions:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class CamembertPooler(nn.Module):
+    """Produces a pooled vector from the first token's hidden state via a linear layer and tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # "Pooling" here is nothing more than selecting index 0 along the second dimension
+        # (the first token) and passing it through dense + tanh.
+        return self.activation(self.dense(hidden_states[:, 0]))
+
+
+@auto_docstring
+class CamembertPreTrainedModel(PreTrainedModel):
+    config: CamembertConfig
+    base_model_prefix = "roberta"
+    supports_gradient_checkpointing = True
+    _supports_sdpa = True
+
+    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with BertLMPredictionHead->CamembertLMHead
+    def _init_weights(self, module):
+        """Initialise the parameters of `module` according to its type; other module types are left untouched."""
+        if isinstance(module, nn.Linear):
+            # Plain normal init; the TF version uses truncated_normal instead
+            # (cf https://github.com/pytorch/pytorch/pull/5617).
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+            return
+
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # The padding row, when there is one, is reset to zeros after the random init.
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+            return
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+            return
+
+        if isinstance(module, CamembertLMHead):
+            module.bias.data.zero_()
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Camembert
+class CamembertClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        # Prefer the dedicated classifier dropout; fall back to the hidden dropout when unset.
+        dropout_prob = config.classifier_dropout
+        if dropout_prob is None:
+            dropout_prob = config.hidden_dropout_prob
+        self.dropout = nn.Dropout(dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        """Map `features` to logits: first-token slice -> dropout -> dense -> tanh -> dropout -> out_proj."""
+        first_token = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        hidden = torch.tanh(self.dense(self.dropout(first_token)))
+        return self.out_proj(self.dropout(hidden))
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Camembert
+class CamembertLMHead(nn.Module):
+    """Camembert Head for masked language modeling."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
+        # The head keeps its own bias parameter and shares that same object with the decoder.
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.decoder.bias = self.bias
+
+    def forward(self, features, **kwargs):
+        """Return vocabulary-sized scores: dense -> gelu -> layer_norm -> decoder (with bias)."""
+        transformed = self.layer_norm(gelu(self.dense(features)))
+        return self.decoder(transformed)
+
+    def _tie_weights(self):
+        # Re-link the two bias references if they got disconnected (on TPU or when the bias is resized).
+        # Kept for accelerate compatibility and backward compatibility.
+        if self.decoder.bias.device.type != "meta":
+            self.bias = self.decoder.bias
+        else:
+            self.decoder.bias = self.bias
+
+
+@auto_docstring
+class CamembertModel(CamembertPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
+    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
+    `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+
+    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
+
+    """
+
+    _no_split_modules = []
+
+    # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.__init__ with Roberta->Camembert
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = CamembertEmbeddings(config)
+        self.encoder = CamembertEncoder(config)
+
+        # `self.pooler` is None when the pooling layer is disabled; `forward` checks for that.
+        self.pooler = CamembertPooler(config) if add_pooling_layer else None
+
+        # Cached here because `forward` uses both values to decide how to build attention masks.
+        self.attn_implementation = config._attn_implementation
+        self.position_embedding_type = config.position_embedding_type
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the word-embedding module of `self.embeddings`."""
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        """Replace the word-embedding module of `self.embeddings` with `value`."""
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.forward
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        # Unset output flags fall back to the values stored on the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Caching is only honoured for decoder configs; for anything else it is forced off,
+        # regardless of what the caller passed.
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            # Drop the trailing embedding dimension to get the same shape `input_ids` would have.
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # Number of positions already held in the cache. `Cache` objects report it through
+        # `get_seq_length()`; anything else is indexed as `[0][0]` and read from `shape[-2]`.
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = (
+                past_key_values[0][0].shape[-2]
+                if not isinstance(past_key_values, Cache)
+                else past_key_values.get_seq_length()
+            )
+
+        # Default token types: reuse the `token_type_ids` attribute of the embeddings module when
+        # it exists (sliced to the sequence length and expanded over the batch), else all zeros.
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        # Without a mask every position is attended to, including the cached ones.
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
+
+        # The SDPA-specific mask helpers are used only when all four conditions hold; any
+        # other combination goes through the generic extended-mask path below.
+        use_sdpa_attention_masks = (
+            self.attn_implementation == "sdpa"
+            and self.position_embedding_type == "absolute"
+            and head_mask is None
+            and not output_attentions
+        )
+
+        # Expand the attention mask
+        if use_sdpa_attention_masks and attention_mask.dim() == 2:
+            # Expand the attention mask for SDPA.
+            # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+            if self.config.is_decoder:
+                extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    input_shape,
+                    embedding_output,
+                    past_key_values_length,
+                )
+            else:
+                extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    attention_mask, embedding_output.dtype, tgt_len=seq_length
+                )
+        else:
+            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+            # ourselves in which case we just need to make it broadcastable to all heads.
+            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            # Default cross-attention mask: attend to every encoder position.
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+
+            if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2:
+                # Expand the attention mask for SDPA.
+                # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+                encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
+                )
+            else:
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            # No cross-attention mask when not a decoder or when no encoder states were passed.
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        sequence_output = encoder_outputs[0]
+        # `pooled_output` stays None when the model was built without a pooling layer.
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        # Tuple form: sequence output, pooled output (possibly None), then every encoder
+        # output after the first.
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
+class CamembertForMaskedLM(CamembertPreTrainedModel):
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        # Only a warning: a decoder config is still accepted.
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        # The backbone is built without a pooling layer.
+        self.roberta = CamembertModel(config, add_pooling_layer=False)
+        self.lm_head = CamembertLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        # The output embeddings are the decoder layer of the LM head.
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        # Swap the decoder layer of the LM head for `new_embeddings`.
+        self.lm_head.decoder = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        prediction_scores = self.lm_head(outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            targets = labels.to(prediction_scores.device)
+            flat_scores = prediction_scores.view(-1, self.config.vocab_size)
+            masked_lm_loss = CrossEntropyLoss()(flat_scores, targets.view(-1))
+
+        if return_dict:
+            return MaskedLMOutput(
+                loss=masked_lm_loss,
+                logits=prediction_scores,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple form: scores plus everything after the first two backbone outputs,
+        # with the loss prepended when one was computed.
+        output = (prediction_scores,) + outputs[2:]
+        if masked_lm_loss is None:
+            return output
+        return (masked_lm_loss,) + output
+
+
+@auto_docstring(
+    custom_intro="""
+    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """
+)
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
+class CamembertForSequenceClassification(CamembertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        # The backbone is built without a pooling layer; the head reads the sequence output directly.
+        self.roberta = CamembertModel(config, add_pooling_layer=False)
+        self.classifier = CamembertClassificationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = self.classifier(outputs[0])
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+
+            # When the config does not name a problem type, infer one from the label count
+            # and label dtype, and store it on the config for subsequent calls.
+            if self.config.problem_type is None:
+                has_integer_labels = labels.dtype in (torch.long, torch.int)
+                if self.num_labels == 1:
+                    inferred_type = "regression"
+                elif self.num_labels > 1 and has_integer_labels:
+                    inferred_type = "single_label_classification"
+                else:
+                    inferred_type = "multi_label_classification"
+                self.config.problem_type = inferred_type
+
+            problem_type = self.config.problem_type
+            if problem_type == "regression":
+                mse = MSELoss()
+                # With a single label both tensors are squeezed before the comparison.
+                loss = mse(logits.squeeze(), labels.squeeze()) if self.num_labels == 1 else mse(logits, labels)
+            elif problem_type == "single_label_classification":
+                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+            elif problem_type == "multi_label_classification":
+                loss = BCEWithLogitsLoss()(logits, labels)
+
+        if return_dict:
+            return SequenceClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # Tuple form: logits plus everything after the first two backbone outputs,
+        # with the loss prepended when one was computed.
+        output = (logits,) + outputs[2:]
+        if loss is None:
+            return output
+        return (loss,) + output
+
+
+@auto_docstring
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT
+class CamembertForMultipleChoice(CamembertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        # NOTE(review): the encoder attribute is named `roberta`, not `camembert`; presumably for checkpoint-key compatibility — confirm.
+        self.roberta = CamembertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)  # one score per flattened (example, choice) row
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the values in this tensor should always be < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of the position of each input sequence token in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]  # NOTE(review): AttributeError if both inputs are None
+        # Fold the leading dims together (`view(-1, ...)`) so that every choice is encoded as its own sequence.
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.roberta(
+            flat_input_ids,
+            position_ids=flat_position_ids,
+            token_type_ids=flat_token_type_ids,
+            attention_mask=flat_attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = outputs[1]  # NOTE(review): index 1 is presumably the encoder's pooled output — confirm against CamembertModel
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)  # regroup the per-row scores into rows of `num_choices`
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]  # tuple form: logits followed by encoder outputs from index 2 on
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
+class CamembertForTokenClassification(CamembertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        # The encoder is built without its pooling layer; this head works on the per-position encoder output.
+        self.roberta = CamembertModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )  # falls back to the generic hidden dropout when no classifier-specific rate is configured
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the values in this tensor should always be < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]  # first encoder output; fed position-wise through dropout and the classifier
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # flatten to one row per label entry
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # tuple form: logits followed by encoder outputs from index 2 on
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
+class CamembertForQuestionAnswering(CamembertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        # The encoder is built without its pooling layer; the span head works on the per-position encoder output.
+        self.roberta = CamembertModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the values in this tensor should always be < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)  # two-way unpack of size-1 slices: requires config.num_labels == 2
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:  # the loss needs both targets; one alone is silently ignored
+            # On multi-GPU the split can add an extra trailing dimension to the positions; squeeze it away
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)  # size of dim 1 of the logits doubles as the "ignore" sentinel
+            start_positions = start_positions.clamp(0, ignored_index)  # NOTE(review): negative positions clamp to 0 and still count in the loss
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2  # plain average of the two span-boundary losses
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.
+    """
+)
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, FacebookAI/roberta-base->almanach/camembert-base
+class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
+
+        self.roberta = CamembertModel(config, add_pooling_layer=False)
+        self.lm_head = CamembertLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base")
+        >>> config = AutoConfig.from_pretrained("almanach/camembert-base")
+        >>> config.is_decoder = True
+        >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.roberta(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
+            lm_loss = self.loss_function(
+                prediction_scores,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        input_ids: torch.Tensor of token ids; positions are accumulated along dim 1.
+        padding_idx: id of the padding token. past_key_values_length: offset added to every non-padding position.
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()  # 1 for real tokens, 0 for padding
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx  # padding slots (mask == 0) land exactly on padding_idx
+
+
+__all__ = [  # names exported by `from <this module> import *`
+    "CamembertForCausalLM",
+    "CamembertForMaskedLM",
+    "CamembertForMultipleChoice",
+    "CamembertForQuestionAnswering",
+    "CamembertForSequenceClassification",
+    "CamembertForTokenClassification",
+    "CamembertModel",
+    "CamembertPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py
new file mode 100644
index 0000000000000000000000000000000000000000..0869902aa96245ead1bb22f2e818517288a35534
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/modeling_tf_camembert.py
@@ -0,0 +1,1800 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 CamemBERT model."""
+
+from __future__ import annotations
+
+import math
+import warnings
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFBaseModelOutputWithPoolingAndCrossAttentions,
+    TFCausalLMOutputWithCrossAttentions,
+    TFMaskedLMOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFMaskedLanguageModelingLoss,
+    TFModelInputType,
+    TFMultipleChoiceLoss,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_camembert import CamembertConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "almanach/camembert-base"
+_CONFIG_FOR_DOC = "CamembertConfig"
+
+
+CAMEMBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`CamembertConfig`]): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CAMEMBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings
+class TFCamembertEmbeddings(keras.layers.Layer):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.padding_idx = 1
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.max_position_embeddings = config.max_position_embeddings
+        self.initializer_range = config.initializer_range
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def build(self, input_shape=None):
+        with tf.name_scope("word_embeddings"):
+            self.weight = self.add_weight(
+                name="weight",
+                shape=[self.config.vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("token_type_embeddings"):
+            self.token_type_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.config.type_vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+    def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
+        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
+
+        Args:
+            input_ids: tf.Tensor
+        Returns: tf.Tensor
+        """
+        mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
+        incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
+
+        return incremental_indices + self.padding_idx
+
+    def call(
+        self,
+        input_ids=None,
+        position_ids=None,
+        token_type_ids=None,
+        inputs_embeds=None,
+        past_key_values_length=0,
+        training=False,
+    ):
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+
+        if input_ids is not None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        input_shape = shape_list(inputs_embeds)[:-1]
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(
+                    input_ids=input_ids, past_key_values_length=past_key_values_length
+                )
+            else:
+                position_ids = tf.expand_dims(
+                    tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
+                )
+
+        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
+        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
+        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert
+class TFCamembertPooler(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+        # Single tanh-activated projection that keeps the hidden size unchanged.
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]  # index 0 along the second axis
+        pooled_output = self.dense(inputs=first_token_tensor)
+
+        return pooled_output
+
+    def build(self, input_shape=None):
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])  # NOTE(review): rank-3 shape given, but call() feeds the [:, 0] slice — confirm only the last dim matters
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert
+class TFCamembertSelfAttention(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+        self.query = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
+        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+
+        self.is_decoder = config.is_decoder
+        self.config = config
+
+    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        batch_size = shape_list(hidden_states)[0]
+        mixed_query_layer = self.query(inputs=hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
+            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # (batch size, num_heads, seq_len_q, seq_len_k)
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+        attention_scores = tf.divide(attention_scores, dk)
+
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in TFCamembertModel call() function)
+            attention_scores = tf.add(attention_scores, attention_mask)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(inputs=attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = tf.multiply(attention_probs, head_mask)
+
+        attention_output = tf.matmul(attention_probs, value_layer)
+        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+        # (batch_size, seq_len_q, all_head_size)
+        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.config.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.config.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert
+class TFCamembertSelfOutput(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert
+class TFCamembertAttention(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.self_attention = TFCamembertSelfAttention(config, name="self")
+        self.dense_output = TFCamembertSelfOutput(config, name="output")
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(
+        self,
+        input_tensor: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        self_outputs = self.self_attention(
+            hidden_states=input_tensor,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_output = self.dense_output(
+            hidden_states=self_outputs[0], input_tensor=input_tensor, training=training
+        )
+        # add attentions (possibly with past_key_value) if we output them
+        outputs = (attention_output,) + self_outputs[1:]
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attention", None) is not None:
+            with tf.name_scope(self.self_attention.name):
+                self.self_attention.build(None)
+        if getattr(self, "dense_output", None) is not None:
+            with tf.name_scope(self.dense_output.name):
+                self.dense_output.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert
+class TFCamembertIntermediate(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert
+class TFCamembertOutput(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.intermediate_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert
+class TFCamembertLayer(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.attention = TFCamembertAttention(config, name="attention")
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = TFCamembertAttention(config, name="crossattention")
+        self.intermediate = TFCamembertIntermediate(config, name="intermediate")
+        self.bert_output = TFCamembertOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_value: tuple[tf.Tensor] | None,
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            input_tensor=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            past_key_value=self_attn_past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                input_tensor=attention_output,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        intermediate_output = self.intermediate(hidden_states=attention_output)
+        layer_output = self.bert_output(
+            hidden_states=intermediate_output, input_tensor=attention_output, training=training
+        )
+        outputs = (layer_output,) + outputs  # add attentions if we output them
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "bert_output", None) is not None:
+            with tf.name_scope(self.bert_output.name):
+                self.bert_output.build(None)
+        if getattr(self, "crossattention", None) is not None:
+            with tf.name_scope(self.crossattention.name):
+                self.crossattention.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert
+class TFCamembertEncoder(keras.layers.Layer):
+    def __init__(self, config: CamembertConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layer = [TFCamembertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None,
+        use_cache: bool | None,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask[i],
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention and encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
+            )
+
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert
+class TFCamembertMainLayer(keras.layers.Layer):
+    config_class = CamembertConfig
+
+    def __init__(self, config, add_pooling_layer=True, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.is_decoder = config.is_decoder
+
+        self.num_hidden_layers = config.num_hidden_layers
+        self.initializer_range = config.initializer_range
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict
+        self.encoder = TFCamembertEncoder(config, name="encoder")
+        self.pooler = TFCamembertPooler(config, name="pooler") if add_pooling_layer else None
+        # The embeddings must be the last declaration in order to follow the weights order
+        self.embeddings = TFCamembertEmbeddings(config, name="embeddings")
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.embeddings
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings
+    def set_input_embeddings(self, value: tf.Variable):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError
+
+    @unpack_inputs
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]:
+        if not self.config.is_decoder:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+
+        if past_key_values is None:
+            past_key_values_length = 0
+            past_key_values = [None] * len(self.encoder.layer)
+        else:
+            past_key_values_length = shape_list(past_key_values[0][0])[-2]
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+            training=training,
+        )
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        attention_mask_shape = shape_list(attention_mask)
+
+        mask_seq_length = seq_length + past_key_values_length
+        # Copied from `modeling_tf_t5.py`
+        # Provided a padding mask of dimensions [batch_size, mask_seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+        if self.is_decoder:
+            seq_ids = tf.range(mask_seq_length)
+            causal_mask = tf.less_equal(
+                tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
+                seq_ids[None, :, None],
+            )
+            causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype)
+            extended_attention_mask = causal_mask * attention_mask[:, None, :]
+            attention_mask_shape = shape_list(extended_attention_mask)
+            extended_attention_mask = tf.reshape(
+                extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2])
+            )
+            if past_key_values[0] is not None:
+                # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length]
+                extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :]
+        else:
+            extended_attention_mask = tf.reshape(
+                attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
+        one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
+        ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
+        extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
+
+        # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
+        if self.is_decoder and encoder_attention_mask is not None:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+            # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build the encoder, pooler and embeddings sub-layers once, each inside its own name scope."""
+        if not self.built:
+            self.built = True
+            # `input_shape` is not used: every sub-layer is built with `None`.
+            # Build order is fixed: encoder, then pooler, then embeddings.
+            for attr_name in ("encoder", "pooler", "embeddings"):
+                sublayer = getattr(self, attr_name, None)
+                if sublayer is None:
+                    # Missing or `None` sub-layers are skipped.
+                    continue
+                with tf.name_scope(sublayer.name):
+                    sublayer.build(None)
+
+
+class TFCamembertPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = CamembertConfig  # configuration type shared by every model class derived from this base
+    base_model_prefix = "roberta"  # same string as the `name="roberta"` given to the main layer in the subclasses
+
+
+@add_start_docstrings(
+    "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.",
+    CAMEMBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with Roberta->Camembert, ROBERTA->CAMEMBERT
+class TFCamembertModel(TFCamembertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.roberta = TFCamembertMainLayer(config, name="roberta")  # the only sub-layer; `call` delegates to it
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutputWithPoolingAndCrossAttentions:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`):
+            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation.
+        """
+        outputs = self.roberta(  # pure pass-through: every argument is forwarded unchanged, by keyword
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs  # the main layer's result is returned as-is, with no post-processing
+
+    def build(self, input_shape=None):  # `input_shape` is ignored
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True  # flag is set before the sub-layer is built
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):  # build under the sub-layer's own name scope
+                self.roberta.build(None)
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert
+class TFCamembertLMHead(keras.layers.Layer):
+    """Camembert Head for masked language modeling: dense -> gelu -> layer norm -> tied vocabulary projection."""
+
+    def __init__(self, config, input_embeddings, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.dense = keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.act = get_tf_activation("gelu")
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = input_embeddings
+
+    def build(self, input_shape=None):
+        """Create the output bias and build the `dense` / `layer_norm` sub-layers; repeated calls are no-ops."""
+        if self.built:
+            return
+        self.built = True
+        # Must stay after the guard: created before it, every repeated `build()` would add another "bias" weight.
+        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+
+    def get_output_embeddings(self):
+        return self.decoder  # the embeddings object handed to `__init__`
+
+    def set_output_embeddings(self, value):
+        self.decoder.weight = value  # replaces the projection matrix used by `call`
+        self.decoder.vocab_size = shape_list(value)[0]  # first axis of `value` becomes the vocabulary size
+
+    def get_bias(self):
+        return {"bias": self.bias}  # same dict layout that `set_bias` expects
+
+    def set_bias(self, value):
+        self.bias = value["bias"]
+        self.config.vocab_size = shape_list(value["bias"])[0]  # also overwrites `config.vocab_size`
+
+    def call(self, hidden_states):
+        """Return vocabulary logits reshaped to `(-1, seq_length, vocab_size)` for the given hidden states."""
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        # project back to size of vocabulary with bias
+        seq_length = shape_list(tensor=hidden_states)[1]
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
+        hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
+
+        return hidden_states
+
+
+@add_start_docstrings(
+    """CamemBERT Model with a `language modeling` head on top.""",
+    CAMEMBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT
+class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelingLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")  # encoder, no pooler
+        self.lm_head = TFCamembertLMHead(config, self.roberta.embeddings, name="lm_head")  # head gets the embeddings
+
+    def get_lm_head(self):
+        return self.lm_head  # the head layer created in `__init__`
+
+    def get_prefix_bias_name(self):
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.lm_head.name  # "<model name>/<lm_head name>"
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+        mask="<mask>",
+        expected_output="' Paris'",
+        expected_loss=0.1,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        outputs = self.roberta(  # encoder pass; only `input_ids` is positional
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]  # first element of the main-layer output
+        prediction_scores = self.lm_head(sequence_output)  # vocabulary logits from the LM head
+
+        loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)  # loss only with labels
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]  # drops outputs[0] and the pooled-output slot outputs[1]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when it exists
+
+        return TFMaskedLMOutput(
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # `input_shape` is ignored
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):  # each sub-layer is built under its own name scope
+                self.roberta.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build(None)
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead
+class TFCamembertClassificationHead(keras.layers.Layer):
+    """Sentence-level classification head: first-token features -> dropout -> tanh dense -> dropout -> projection."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        # Fall back to the generic hidden dropout when no classifier-specific rate is configured.
+        if config.classifier_dropout is not None:
+            dropout_rate = config.classifier_dropout
+        else:
+            dropout_rate = config.hidden_dropout_prob
+        self.dense = keras.layers.Dense(
+            config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.dropout = keras.layers.Dropout(dropout_rate)
+        self.out_proj = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
+        )
+
+    def call(self, features, training=False):
+        """Map `features` to logits using only position 0 of axis 1 (the <s> token, equiv. to [CLS])."""
+        cls_state = self.dropout(features[:, 0, :], training=training)
+        projected = self.dropout(self.dense(cls_state), training=training)
+        return self.out_proj(projected)
+
+    def build(self, input_shape=None):
+        """Build `dense` then `out_proj` once; both are built for a trailing dimension of `hidden_size`."""
+        if self.built:
+            return
+        self.built = True
+        for attr_name in ("dense", "out_proj"):
+            sublayer = getattr(self, attr_name, None)
+            if sublayer is not None:
+                with tf.name_scope(sublayer.name):
+                    sublayer.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    CAMEMBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
+class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenceClassificationLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")  # encoder, no pooler
+        self.classifier = TFCamembertClassificationHead(config, name="classifier")  # head selects the first token itself
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="cardiffnlp/twitter-roberta-base-emotion",
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output="'optimism'",
+        expected_loss=0.08,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        outputs = self.roberta(  # encoder pass; only `input_ids` is positional
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]  # first element of the main-layer output
+        logits = self.classifier(sequence_output, training=training)  # the head receives the full sequence output
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)  # loss only when labels are given
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # drops outputs[0] and the pooled-output slot outputs[1]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when it exists
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # `input_shape` is ignored
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):  # each sub-layer is built under its own name scope
+                self.roberta.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(None)
+
+
+@add_start_docstrings(
+    """
+    CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
+    for Named-Entity-Recognition (NER) tasks.
+    """,
+    CAMEMBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT
+class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClassificationLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
+    _keys_to_ignore_on_load_missing = [r"dropout"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")  # encoder, no pooler
+        classifier_dropout = (  # classifier-specific rate, else the generic hidden dropout rate
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.classifier = keras.layers.Dense(  # per-token projection to `config.num_labels` scores
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config  # kept because `build` reads `hidden_size` from it
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="ydshieh/roberta-large-ner-english",
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
+        expected_loss=0.01,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        outputs = self.roberta(  # encoder pass; only `input_ids` is positional
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]  # first element of the main-layer output
+
+        sequence_output = self.dropout(sequence_output, training=training)  # dropout follows the `training` flag
+        logits = self.classifier(sequence_output)  # dense layer applied to the dropped-out sequence output
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)  # loss only when labels are given
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]  # drops outputs[0] and the pooled-output slot outputs[1]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when it exists
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # `input_shape` is ignored; `self.dropout` is not built here
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):  # each sub-layer is built under its own name scope
+                self.roberta.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])  # trailing input dim is hidden_size
+
+
+@add_start_docstrings(
+    """
+    CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    CAMEMBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT
+class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"lm_head"]
+    _keys_to_ignore_on_load_missing = [r"dropout"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.roberta = TFCamembertMainLayer(config, name="roberta")  # pooler kept: `call` reads the pooled output
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(  # one score per flattened (example, choice) row
+            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config  # kept because `build` reads `hidden_size` from it
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(
+        CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+    )
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices - 1]`
+            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
+        """
+        # Axes 1 and 2 hold the number of choices and the sequence length for `input_ids` and `inputs_embeds` alike.
+        reference = input_ids if input_ids is not None else inputs_embeds
+        num_choices, seq_length = shape_list(reference)[1:3]
+        # Fold the choice axis into the batch axis before calling the encoder.
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+        flat_inputs_embeds = None
+        if inputs_embeds is not None:
+            flat_inputs_embeds = tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[-1]))
+        # Pass by keyword: positional output flags would bind to the encoder's cross-attention parameters instead.
+        outputs = self.roberta(
+            input_ids=flat_input_ids,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=head_mask,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        pooled_output = outputs[1]  # second element of the main-layer output: the pooled output
+        pooled_output = self.dropout(pooled_output, training=training)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = tf.reshape(logits, (-1, num_choices))  # back to one row of choice scores per example
+
+        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # `input_shape` is ignored; `self.dropout` is not built here
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):
+                self.roberta.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    CAMEMBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT
+class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsweringLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")  # encoder, no pooler
+        self.qa_outputs = keras.layers.Dense(  # per-token projection to `config.num_labels` scores
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
+        self.config = config  # kept because `build` reads `hidden_size` from it
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint="ydshieh/roberta-base-squad2",
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output="' puppet'",
+        expected_loss=0.86,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        outputs = self.roberta(  # encoder pass; only `input_ids` is positional
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]  # first element of the main-layer output
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two equal halves along the last axis
+        start_logits = tf.squeeze(start_logits, axis=-1)  # squeezing needs a last axis of size 1, i.e. num_labels == 2
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        loss = None
+        if start_positions is not None and end_positions is not None:  # loss needs both span ends
+            labels = {"start_position": start_positions}
+            labels["end_position"] = end_positions
+            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]  # drops outputs[0] and the pooled-output slot outputs[1]
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when it exists
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # `input_shape` is ignored
+        if self.built:
+            return  # already built: nothing to do
+        self.built = True
+        if getattr(self, "roberta", None) is not None:
+            with tf.name_scope(self.roberta.name):  # each sub-layer is built under its own name scope
+                self.roberta.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.config.hidden_size])  # trailing input dim is hidden_size
+
+
+@add_start_docstrings(
+    """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT
+class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelingLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
+
+    def __init__(self, config: CamembertConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        if not config.is_decoder:  # warn only, never raise. NOTE(review): the message names `TFCamembertLMHeadModel`
+            logger.warning("If you want to use `TFCamembertLMHeadModel` as a standalone, add `is_decoder=True.`")
+
+        self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta")  # encoder, no pooler
+        self.lm_head = TFCamembertLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head")  # shares embeddings
+
+    def get_lm_head(self):  # accessor for the LM head layer created in `__init__`
+        return self.lm_head
+
+    def get_prefix_bias_name(self):  # deprecated: warns on every call, then returns "<model name>/<lm_head name>"
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return "/".join((self.name, self.lm_head.name))
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
+        """Assemble the keyword arguments for one generation step; extra `model_kwargs` are not used."""
+        full_shape = input_ids.shape  # taken before `input_ids` is trimmed below
+
+        # no mask supplied: create an all-ones mask on the fly with the shape of the untrimmed `input_ids`
+        mask = attention_mask if attention_mask is not None else tf.ones(full_shape)
+
+        # when a cache is passed, only the last position of `input_ids` is forwarded
+        step_ids = input_ids if past_key_values is None else input_ids[:, -1:]
+
+        return {"input_ids": step_ids, "attention_mask": mask, "past_key_values": past_key_values}
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFCausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`, *optional*):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation.
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        outputs = self.roberta(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.lm_head(hidden_states=sequence_output, training=training)
+        loss = None
+
+        if labels is not None:
+            # align for next-token loss: drop the last logit and the first label (position t is scored against t+1)
+            shifted_logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build `roberta` and `lm_head` once, each under its own name scope; `input_shape` is unused."""
+        if not self.built:
+            self.built = True
+            if getattr(self, "roberta", None) is not None:
+                with tf.name_scope(self.roberta.name):
+                    self.roberta.build(None)
+            if getattr(self, "lm_head", None) is not None:
+                with tf.name_scope(self.lm_head.name):
+                    self.lm_head.build(None)
+
+
+__all__ = [
+    "TFCamembertForCausalLM",
+    "TFCamembertForMaskedLM",
+    "TFCamembertForMultipleChoice",
+    "TFCamembertForQuestionAnswering",
+    "TFCamembertForSequenceClassification",
+    "TFCamembertForTokenClassification",
+    "TFCamembertModel",
+    "TFCamembertPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd6e399f208df858d647dce23fe0ad74248d80b8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert.py
@@ -0,0 +1,323 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for Camembert model."""
+
+import os
+from shutil import copyfile
+from typing import Any, Optional
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+
+@requires(backends=("sentencepiece",))
+class CamembertTokenizer(PreTrainedTokenizer):
+    """
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a CamemBERT tokenizer. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (`list[str]`, *optional*, defaults to `['<s>NOTUSED', '</s>NOTUSED', '<unk>NOTUSED']`):
+            Additional special tokens used by the tokenizer.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Attributes:
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],  # NOTE(review): mutable default list
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        # Mask token behaves like a normal word, i.e. includes the space before it
+        mask_token = (
+            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True)
+            if isinstance(mask_token, str)
+            else mask_token
+        )
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # HACK: These tokens were added by the author for an obscure reason as they were already part of the
+        # sentencepiece vocabulary (this is the case for <s> and </s> and <unk>).
+        # In this case it is recommended to properly set the tokens by hand.
+        self._added_tokens_decoder = {
+            0: AddedToken("<s>NOTUSED", special=True),
+            1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
+            2: AddedToken("</s>NOTUSED", special=True),
+            3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
+            4: AddedToken("<unk>NOTUSED", special=True),
+        }
+
+        self.fairseq_offset = 4  # 3 tokens are newly added, but the offset starts from 4
+
+        # legacy: camembert is a particular case where we have to make sure `"<unk>NOTUSED"` is here
+        if "added_tokens_decoder" in kwargs:
+            # this is the only class that requires this unfortunately.....
+            # the reason is that the fast version has a whole [sic; presumably "hole", a gap in its ids - TODO confirm].
+            kwargs["added_tokens_decoder"].update(self._added_tokens_decoder)
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        # Size of the SentencePiece model alone; the added tokens (placed at the beginning) are not counted.
+        return len(self.sp_model)
+
+    def get_vocab(self):  # token -> id; entries from `added_tokens_encoder` are applied last
+        token_to_id = dict((self.convert_ids_to_tokens(i), i) for i in range(self.vocab_size + self.fairseq_offset))
+        token_to_id.update(self.added_tokens_encoder)
+        return token_to_id
+
+    def _tokenize(self, text: str) -> list[str]:  # pieces are requested as strings (`out_type=str`)
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Convert a token (str) to its id: the SentencePiece id shifted by `fairseq_offset`, or the unk id."""
+        piece_id = self.sp_model.PieceToId(token)  # looked up once and reused below
+        if piece_id == 0:
+            # SentencePiece id 0 is the unk piece: map it to the tokenizer's unk id (both 3 and 4 point to unk)
+            return self.unk_token_id
+        return self.fairseq_offset + piece_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) into a token (str), subtracting `fairseq_offset` before the lookup."""
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings) into a single string; special tokens are appended verbatim."""
+        # TODO decode outputs do not match between fast and slow
+        special_tokens = self.all_special_tokens  # read once, not once per token
+        current_sub_tokens = []
+        out_string, prev_is_special = "", False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string.strip()
+
+    def __getstate__(self):
+        """Return a shallow copy of the instance dict in which `sp_model` is replaced by `None`."""
+        # the live instance is left untouched; only the returned copy drops the processor
+        return {**self.__dict__, "sp_model": None}
+
+    def __setstate__(self, d):
+        """Restore the instance dict from `d` and reload the SentencePiece model from `vocab_file`."""
+        self.__dict__ = d
+        # for backward compatibility: state without `sp_model_kwargs` gets an empty dict
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        # `str(...)` mirrors the load in `__init__`, so a path-like `vocab_file` is handled the same way
+        self.sp_model.Load(str(self.vocab_file))
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """Write the SentencePiece vocabulary file into `save_directory` and return its path as a 1-tuple."""
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return  # NOTE: returns None (not a tuple) when `save_directory` is not a directory
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        # Copy the file if it exists at another path; if it is missing, write the serialized model; else do nothing.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. A CamemBERT sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        start, end = [self.cls_token_id], [self.sep_token_id]
+        first = start + token_ids_0 + end
+        if token_ids_1 is None:
+            return first
+        return first + end + token_ids_1 + end
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Build the special-tokens mask for one sequence or a pair of sequences given without special tokens. When
+        `already_has_special_tokens` is set, the computation is delegated to the parent class instead.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        mask = [1, *([0] * len(token_ids_0)), 1]
+        if token_ids_1 is not None:
+            mask += [1, *([0] * len(token_ids_1)), 1]
+        return mask
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Return the token type ids for one sequence or a pair of sequences once special tokens are counted in.
+        CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        tail, head = [self.sep_token_id], [self.cls_token_id]
+        # the concatenation is built only to measure the total length including special tokens
+        framed = head + token_ids_0 + tail
+        if token_ids_1 is not None:
+            framed = framed + tail + token_ids_1 + tail
+        return [0] * len(framed)
+
+
+__all__ = ["CamembertTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..423058ed959aeb3e7122fb3730a6d0f1f57b5982
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/camembert/tokenization_camembert_fast.py
@@ -0,0 +1,197 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Fast tokenization classes for Camembert model."""
+
+import os
+from shutil import copyfile
+from typing import Optional
+
+from ...tokenization_utils import AddedToken
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import is_sentencepiece_available, logging
+
+
+if is_sentencepiece_available():
+    from .tokenization_camembert import CamembertTokenizer
+else:
+    CamembertTokenizer = None
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+
+class CamembertTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (`list[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = CamembertTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],  # NOTE(review): mutable default list
+        **kwargs,
+    ):
+        # Mask token behaves like a normal word, i.e. includes the space before it. Will have normalized = False
+        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+        # Kept on the instance: `save_vocabulary` copies this file.
+        self.vocab_file = vocab_file
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. A CamemBERT sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        cls_ids, sep_ids = [self.cls_token_id], [self.sep_token_id]
+        output = cls_ids + token_ids_0 + sep_ids
+        if token_ids_1 is not None:
+            output = output + sep_ids + token_ids_1 + sep_ids
+        return output
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Return the token type ids for one sequence or a pair of sequences once special tokens are counted in.
+        CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        sep_ids, cls_ids = [self.sep_token_id], [self.cls_token_id]
+        # the concatenation is built only to measure the total length including special tokens
+        with_specials = cls_ids + token_ids_0 + sep_ids
+        if token_ids_1 is not None:
+            with_specials = with_specials + sep_ids + token_ids_1 + sep_ids
+        return [0] * len(with_specials)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """Copy the slow-tokenizer vocabulary file into `save_directory` and return its path as a 1-tuple."""
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return  # NOTE: returns None (not a tuple) when `save_directory` is not a directory
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        # Skip the copy when `vocab_file` already is the target path.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["CamembertTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ad11a90a24bc4e8c9fd744bca6297e5388fd52e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_chameleon import *
+    from .image_processing_chameleon import *
+    from .image_processing_chameleon_fast import *
+    from .modeling_chameleon import *
+    from .processing_chameleon import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py
new file mode 100644
index 0000000000000000000000000000000000000000..34436a5288c8187d893d7ca4775b812b4c4d7961
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/configuration_chameleon.py
@@ -0,0 +1,282 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""chameleon model configuration"""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ChameleonVQVAEConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ChameleonVQModel`]. It is used to instantiate a
+    `ChameleonVQModel` according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a
+    configuration with the defaults will yield a similar configuration to the VQModel of the
+    [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B).
+
+    Args:
+        embed_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of each embedding vector.
+        num_embeddings (`int`, *optional*, defaults to 8192):
+            Number of codebook embeddings.
+        double_latent (`bool`, *optional*, defaults to `False`):
+            Whether to use double z channels.
+        latent_channels (`int`, *optional*, defaults to 256):
+            Number of channels for the latent space.
+        resolution (`int`, *optional*, defaults to 512):
+            Resolution of the input images.
+        in_channels (`int`, *optional*, defaults to 3):
+            Number of input channels.
+        base_channels (`int`, *optional*, defaults to 128):
+            Base channel count.
+        channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
+            Channel multipliers for each resolution.
+        num_res_blocks (`int`, *optional*, defaults to 2):
+            Number of residual blocks.
+        attn_resolutions (`list[int]`, *optional*):
+            Resolutions to apply attention.
+        dropout (`float`, *optional*, defaults to 0.0):
+            Dropout rate.
+        attn_type (`str`, *optional*, defaults to `"vanilla"`):
+            Attention type used in VQ-GAN encoder. Can be "vanilla" or None.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+
+    model_type = "chameleon_vqgan"
+    base_config_key = "vq_config"
+
+    def __init__(
+        self,
+        embed_dim: int = 256,
+        num_embeddings: int = 8192,
+        double_latent: bool = False,
+        latent_channels: int = 256,
+        resolution: int = 512,
+        in_channels: int = 3,
+        base_channels: int = 128,
+        channel_multiplier: list[int] = [1, 1, 2, 2, 4],
+        num_res_blocks: int = 2,
+        attn_resolutions: Optional[list[int]] = None,
+        dropout: float = 0.0,
+        attn_type: str = "vanilla",
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)  # remaining keyword arguments are handled by the parent config class
+        self.embed_dim = embed_dim
+        self.num_embeddings = num_embeddings
+        self.double_latent = double_latent
+        self.latent_channels = latent_channels
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.base_channels = base_channels
+        self.channel_multiplier = list(channel_multiplier)  # copy: never alias the shared default list
+        self.num_res_blocks = num_res_blocks
+        self.attn_resolutions = attn_resolutions
+        self.dropout = dropout
+        self.attn_type = attn_type
+        self.initializer_range = initializer_range
+
+
+class ChameleonConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ChameleonModel`]. It is used to instantiate a
+    chameleon model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the
+    [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 65536):
+            Vocabulary size of the chameleon model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ChameleonModel`]; this includes text and image tokens.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with. Chameleon supports up to 4096 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        model_parallel_size (`int`, *optional*, defaults to 1):
+            Number of shards used when training the model. This will be used in qk layernorm because the original Chameleon inference
+            doesn't do reduction in those layers and each rank has its own biases.
+        swin_norm (`bool`, *optional*, defaults to `False`):
+            Use Swin Transformer normalization.
+        vq_config (`dict`, *optional*):
+            Keyword arguments (a dict) used to build the [`ChameleonVQVAEConfig`] of the VQ-VAE model.
+        vocabulary_map (`dict`, *optional*):
+            A dictionary containing the vocabulary map from the tokenizer. Used to obtain tokens from the image inputs.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+
+
+    ```python
+    >>> from transformers import ChameleonModel, ChameleonConfig
+
+    >>> # Initializing a chameleon chameleon-7b style configuration
+    >>> configuration = ChameleonConfig()
+
+    >>> # Initializing a model from the chameleon-7b style configuration
+    >>> model = ChameleonModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "chameleon"
+    sub_configs = {"vq_config": ChameleonVQVAEConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=65536,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        model_parallel_size=1,
+        swin_norm=False,
+        vq_config=None,
+        vocabulary_map=None,
+        mlp_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_bias = mlp_bias
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()  # raises ValueError on a malformed `rope_scaling`
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.model_parallel_size = model_parallel_size
+        self.swin_norm = swin_norm
+        # `vq_config` may be omitted, a dict of keyword arguments, or an already-built `ChameleonVQVAEConfig`.
+        if vq_config is None:
+            vq_config = {}
+            logger.info("vq_config is None. initializing the ChameleonVQVAEConfig with default values.")
+        if not isinstance(vq_config, ChameleonVQVAEConfig):
+            vq_config = ChameleonVQVAEConfig(**vq_config)
+        self.vq_config = vq_config
+
+        self.vocabulary_map = vocabulary_map
+        self.image_token_id = vocabulary_map.get("<image>") if vocabulary_map is not None else None
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Check `self.rope_scaling`: it must be `None` or a dict with exactly two keys, `type` ("linear" or "dynamic")
+        and `factor` (a `float` instance greater than 1; ints are rejected). Raises `ValueError` otherwise.
+        """
+        if self.rope_scaling is None:
+            return
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type not in ["linear", "dynamic"]:  # a missing key yields None, which is rejected here
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:  # None fails the isinstance
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+
+
+__all__ = ["ChameleonConfig", "ChameleonVQVAEConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cae9d7bdd34d8b7a2292059ae1e92b587891272
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon.py
@@ -0,0 +1,341 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Chameleon."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import get_resize_output_image_size, resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+if is_vision_available():
+    import PIL
+
+
+class ChameleonImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Chameleon image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 512}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to 1):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`dict[str, int]` *optional*, defaults to {"height": 512, "width": 512}):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to 0.0078):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
+            method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.LANCZOS,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 0.0078,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 512}
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else {"height": 512, "width": 512}
+        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+        # Store the resolved settings; each one can be overridden per call in `preprocess`.
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else [1.0, 1.0, 1.0]
+        self.image_std = image_std if image_std is not None else [1.0, 1.0, 1.0]
+        self.do_convert_rgb = do_convert_rgb
+
+    # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Target size: either `{"shortest_edge": int}` or `{"height": int, "width": int}`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            default_to_square=default_to_square,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+                the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            [`BatchFeature`]: a batch with a single `pixel_values` entry holding the processed images.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size, param_name="size", default_to_square=False)
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_convert_rgb:
+            images = [self.blend_rgba(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+        all_images = []
+        for image in images:
+            if do_resize:
+                image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+            if do_center_crop:
+                image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            all_images.append(image)
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            for image in all_images
+        ]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def blend_rgba(self, image: ImageInput) -> ImageInput:
+        """
+        Flatten a PIL image to RGB, compositing any transparent pixels over a white background.
+        Inputs that are not `PIL.Image.Image`, or that are already in "RGB" mode, are returned unchanged.
+
+        Args:
+            image (`ImageInput`):
+                Image to convert.
+        """
+
+        # Nothing to do for non-PIL inputs or for images that are already RGB.
+        if not isinstance(image, PIL.Image.Image) or image.mode == "RGB":
+            return image
+
+        rgba = np.array(image.convert("RGBA"))
+        alpha_channel = rgba[:, :, 3]
+
+        # Fully opaque everywhere: a plain mode conversion is enough.
+        if not (alpha_channel < 255).any():
+            return image.convert("RGB")
+
+        # Blend over white: out = (1 - a) * 255 + a * rgb, with a = alpha / 255 broadcast over the colour axis.
+        weight = (alpha_channel / 255.0)[:, :, np.newaxis]
+        colour = rgba[:, :, :3]
+        blended = (1 - weight) * 255 + weight * colour
+        return PIL.Image.fromarray(blended.astype("uint8"), "RGB")
+
+
+__all__ = ["ChameleonImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d102614f7df3700845c00f9d8bfa217930c776b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/image_processing_chameleon_fast.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2025 Meta Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Chameleon."""
+
+from typing import Optional
+
+import numpy as np
+import PIL
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils_fast import BaseImageProcessorFast
+from ...image_utils import ImageInput, PILImageResampling, SizeDict
+from ...utils import auto_docstring, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class ChameleonImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.LANCZOS
+    image_mean = [1.0, 1.0, 1.0]
+    image_std = [1.0, 1.0, 1.0]
+    size = {"shortest_edge": 512}
+    default_to_square = False
+    crop_size = {"height": 512, "width": 512}
+    do_resize = True
+    do_center_crop = True
+    do_rescale = True
+    rescale_factor = 0.0078
+    do_normalize = True
+    do_convert_rgb = True
+
+    def convert_to_rgb(self, image: ImageInput) -> ImageInput:
+        """
+        Convert image to RGB by blending the transparency layer if it's in RGBA format.
+        If image is not `PIL.Image`, it si simply returned without modifications.
+
+        Args:
+            image (`ImageInput`):
+                Image to convert.
+        """
+
+        if not isinstance(image, PIL.Image.Image):
+            return image
+        elif image.mode == "RGB":
+            return image
+
+        img_rgba = np.array(image.convert("RGBA"))
+
+        # If there is no transparency layer, simple convert and return.
+        if not (img_rgba[:, :, 3] < 255).any():
+            return image.convert("RGB")
+
+        # There is a transparency layer, blend it with a white background.
+        # Calculate the alpha proportion for blending.
+        alpha = img_rgba[:, :, 3] / 255.0
+        img_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[:, :, np.newaxis] * img_rgba[:, :, :3]
+        return PIL.Image.fromarray(img_rgb.astype("uint8"), "RGB")
+
+    def resize(
+        self,
+        image: "torch.Tensor",
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"] = None,
+        **kwargs,
+    ) -> "torch.Tensor":
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
+
+        Returns:
+            `torch.Tensor`: The resized image.
+        """
+        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        if interpolation == F.InterpolationMode.LANCZOS:
+            logger.warning_once(
+                "You have used fast image processor with LANCZOS resample which not yet supported for torch.Tensor. "
+                "BICUBIC resample will be used as an alternative. Please fall back to slow image processor if you "
+                "want full consistency with the original model."
+            )
+            interpolation = F.InterpolationMode.BICUBIC
+
+        return super().resize(
+            image=image,
+            size=size,
+            interpolation=interpolation,
+            **kwargs,
+        )
+
+
+__all__ = ["ChameleonImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py
new file mode 100644
index 0000000000000000000000000000000000000000..d481a62b6fc6608bd3088e4766b91ff843680ea0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/processing_chameleon.py
@@ -0,0 +1,196 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Chameleon.
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import (
+    MultiModalData,
+    ProcessingKwargs,
+    ProcessorMixin,
+    TextKwargs,
+    Unpack,
+)
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+class ChameleonTextKwargs(TextKwargs, total=False):
+    return_for_text_completion: bool  # if True, the processor does not append the tokenizer sep token to prompts
+
+
+class ChameleonProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: ChameleonTextKwargs  # text kwargs including Chameleon's `return_for_text_completion`
+    _defaults = {  # default kwarg values; presumably applied by `ProcessorMixin._merge_kwargs` (defined elsewhere)
+        "text_kwargs": {
+            "padding": False,
+            "return_for_text_completion": False,
+            "return_mm_token_type_ids": False,
+        },
+        "common_kwargs": {
+            "return_tensors": "pt",
+        },
+    }
+
+
+class ChameleonProcessor(ProcessorMixin):
+    r"""
+    Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single
+    processor.
+
+    [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`].
+    See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`ChameleonImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input.
+        image_seq_length (`int`, *optional*, defaults to 1024):
+            Sequence length of one image embedding.
+        image_token (`str`, *optional*, defaults to `"<image>"`):
+            The special token used to indicate image in the text.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    image_processor_class = "ChameleonImageProcessor"
+
+    def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
+        self.image_seq_length = image_seq_length
+        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.image_start_token = (
+            tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "<racm3:break>"
+        )  # fixed tokens for start and end, so can hardcode
+        self.image_end_token = tokenizer.eoi_token if hasattr(tokenizer, "eoi_token") else "<eoss>"
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.image_start_token_id = tokenizer.convert_tokens_to_ids(self.image_start_token)
+        self.image_end_token_id = tokenizer.convert_tokens_to_ids(self.image_end_token)
+        self.image_ids = [self.image_token_id, self.image_start_token_id, self.image_end_token_id]
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[ChameleonProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+        if text is None and images is None:
+            raise ValueError("You must provide either text or images")
+
+        output_kwargs = self._merge_kwargs(
+            ChameleonProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False)
+
+        # Replace the image token with the expanded image token sequence
+        prompt_strings = []
+        one_img_tokens = self.image_start_token + (self.image_token * self.image_seq_length) + self.image_end_token
+        for sample in text:
+            sample = sample.replace(self.image_token, one_img_tokens)
+            if not return_for_text_completion:
+                sample += self.tokenizer.sep_token  # special Chameleon treatment to add sep for chat mode
+            prompt_strings.append(sample)
+
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # add 2 for BOI and EOI tokens
+            num_image_tokens = [self.image_seq_length + 2] * len(image_sizes)
+            num_image_patches = [1] * len(image_sizes)
+
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+
+__all__ = ["ChameleonProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d54ee86aecef2cbe5b9bfdee321a0375d977880
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # type checkers only: import every submodule's names eagerly
+    from .configuration_clap import *
+    from .feature_extraction_clap import *
+    from .modeling_clap import *
+    from .processing_clap import *
+else:  # at runtime: swap this module's `sys.modules` entry for a `_LazyModule` built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/configuration_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/configuration_clap.py
new file mode 100644
index 0000000000000000000000000000000000000000..900e8d373f5ad15d3d90f2da655abe10f3279f85
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/configuration_clap.py
@@ -0,0 +1,382 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""CLAP model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ClapTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ClapTextModel`]. It is used to instantiate a CLAP
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the CLAP
+    [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the CLAP model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ClapTextModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 514):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 1):
+            The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`].
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimension of the projection head of the `ClapTextModelWithProjection`.
+
+    Examples:
+
+    ```python
+    >>> from transformers import ClapTextConfig, ClapTextModel
+
+    >>> # Initializing a CLAP text configuration
+    >>> configuration = ClapTextConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = ClapTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "clap_text_model"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=514,
+        type_vocab_size=1,
+        initializer_factor=1.0,
+        layer_norm_eps=1e-12,
+        projection_dim=512,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        projection_hidden_act="relu",
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_factor = initializer_factor
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.projection_hidden_act = projection_hidden_act
+        self.projection_dim = projection_dim
+
+
+class ClapAudioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ClapAudioModel`]. It is used to instantiate a
+    CLAP audio encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the audio encoder of the CLAP
+    [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        window_size (`int`, *optional*, defaults to 8):
+            Image size of the spectrogram
+        num_mel_bins (`int`, *optional*, defaults to 64):
+            Number of mel features used per frames. Should correspond to the value used in the `ClapProcessor` class.
+        spec_size (`int`, *optional*, defaults to 256):
+            Desired input size of the spectrogram that the model supports. It can be different from the output of the
+            `ClapFeatureExtractor`, in which case the input features will be resized. Corresponds to the `image_size`
+            of the audio models.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        patch_size (`int`, *optional*, defaults to 4):
+            Patch size for the audio spectrogram
+        patch_stride (`list`, *optional*, defaults to `[4, 4]`):
+            Patch stride for the audio spectrogram
+        num_classes (`int`, *optional*, defaults to 527):
+            Number of classes used for the head training
+        hidden_size (`int`, *optional*, defaults to 768):
+            Hidden size of the output of the audio encoder. Correspond to the dimension of the penultimate layer's
+            output,which is sent to the projection MLP layer.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Hidden size of the projection layer.
+        depths (`list`, *optional*, defaults to `[2, 2, 6, 2]`):
+            Depths used for the Swin Layers of the audio model
+        num_attention_heads (`list`, *optional*, defaults to `[4, 8, 16, 32]`):
+            Number of attention heads used for the Swin Layers of the audio model
+        enable_fusion (`bool`, *optional*, defaults to `False`):
+            Whether or not to enable patch fusion. This is the main contribution of the authors, and should give the
+            best results.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the encoder.
+        fusion_type (`[type]`, *optional*):
+            Fusion type used for the patch fusion.
+        patch_embed_input_channels (`int`, *optional*, defaults to 1):
+            Number of channels used for the input spectrogram
+        flatten_patch_embeds (`bool`, *optional*, defaults to `True`):
+            Whether or not to flatten the patch embeddings
+        patch_embeds_hidden_size (`int`, *optional*, defaults to 96):
+            Hidden size of the patch embeddings. It is used as the number of output channels.
+        enable_patch_layer_norm (`bool`, *optional*, defaults to `True`):
+            Whether or not to enable layer normalization for the patch embeddings
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Drop path rate for the patch fusion
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a bias to the query, key, value projections.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of the mlp hidden dim to embedding dim.
+        aff_block_r (`int`, *optional*, defaults to 4):
+            downsize_ratio used in the AudioFF block
+        num_hidden_layers (`int`, *optional*, defaults to 4):
+            Number of hidden layers in the Transformer encoder.
+        projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        layer_norm_eps (`[type]`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+
+    Example:
+
+    ```python
+    >>> from transformers import ClapAudioConfig, ClapAudioModel
+
+    >>> # Initializing a ClapAudioConfig with laion/clap-htsat-fused style configuration
+    >>> configuration = ClapAudioConfig()
+
+    >>> # Initializing a ClapAudioModel (with random weights) from the laion/clap-htsat-fused style configuration
+    >>> model = ClapAudioModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "clap_audio_model"
+    base_config_key = "audio_config"
+
+    def __init__(
+        self,
+        window_size=8,
+        num_mel_bins=64,
+        spec_size=256,
+        hidden_act="gelu",
+        patch_size=4,
+        patch_stride=[4, 4],
+        num_classes=527,
+        hidden_size=768,
+        projection_dim=512,
+        depths=[2, 2, 6, 2],
+        num_attention_heads=[4, 8, 16, 32],
+        enable_fusion=False,
+        hidden_dropout_prob=0.1,
+        fusion_type=None,
+        patch_embed_input_channels=1,
+        flatten_patch_embeds=True,
+        patch_embeds_hidden_size=96,
+        enable_patch_layer_norm=True,
+        drop_path_rate=0.0,
+        attention_probs_dropout_prob=0.0,
+        qkv_bias=True,
+        mlp_ratio=4.0,
+        aff_block_r=4,
+        num_hidden_layers=4,
+        projection_hidden_act="relu",
+        layer_norm_eps=1e-5,
+        initializer_factor=1.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.window_size = window_size
+        self.num_mel_bins = num_mel_bins
+        self.spec_size = spec_size
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.num_classes = num_classes
+        self.hidden_size = hidden_size
+        self.depths = depths
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.window_size = window_size
+        self.enable_fusion = enable_fusion
+        self.fusion_type = fusion_type
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.projection_dim = projection_dim
+        self.flatten_patch_embeds = flatten_patch_embeds
+        self.patch_embeds_hidden_size = patch_embeds_hidden_size
+        self.enable_patch_layer_norm = enable_patch_layer_norm
+        self.drop_path_rate = drop_path_rate
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.qkv_bias = qkv_bias
+        self.mlp_ratio = mlp_ratio
+        self.patch_embed_input_channels = patch_embed_input_channels
+        self.aff_block_r = aff_block_r
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_factor = initializer_factor
+        self.projection_hidden_act = projection_hidden_act
+
+
+class ClapConfig(PretrainedConfig):
+    r"""
+    [`ClapConfig`] is the configuration class to store the configuration of a [`ClapModel`]. It is used to instantiate
+    a CLAP model according to the specified arguments, defining the text model and audio model configs. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the CLAP
+    [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`ClapTextConfig`].
+        audio_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`ClapAudioConfig`].
+        logit_scale_init_value (`float`, *optional*, defaults to 14.29):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLAP implementation.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and audio projection layers.
+        projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
+            Activation function for the projection layers.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            Factor to scale the initialization of the model weights.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import ClapConfig, ClapModel
+
+    >>> # Initializing a ClapConfig with laion-ai/base style configuration
+    >>> configuration = ClapConfig()
+
+    >>> # Initializing a ClapModel (with random weights) from the laion-ai/base style configuration
+    >>> model = ClapModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize a ClapConfig from a ClapTextConfig and a ClapAudioConfig
+    >>> from transformers import ClapTextConfig, ClapAudioConfig
+
+    >>> # Initializing a ClapText and ClapAudioConfig configuration
+    >>> config_text = ClapTextConfig()
+    >>> config_audio = ClapAudioConfig()
+
+    >>> config = ClapConfig.from_text_audio_configs(config_text, config_audio)
+    ```"""
+
+    model_type = "clap"
+    sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig}
+
+    def __init__(
+        self,
+        text_config=None,
+        audio_config=None,
+        logit_scale_init_value=(1 / 0.07),
+        projection_dim=512,
+        projection_hidden_act="relu",
+        initializer_factor=1.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the ClapTextConfig with default values.")
+
+        if audio_config is None:
+            audio_config = {}
+            logger.info("audio_config is None. initializing the ClapAudioConfig with default values.")
+
+        self.text_config = ClapTextConfig(**text_config)
+        self.audio_config = ClapAudioConfig(**audio_config)
+        self.text_config.projection_dim = projection_dim
+        self.audio_config.projection_dim = projection_dim
+
+        self.text_config.projection_hidden_act = projection_hidden_act
+        self.audio_config.projection_hidden_act = projection_hidden_act
+
+        self.projection_dim = projection_dim
+        self.projection_hidden_act = projection_hidden_act
+        self.hidden_size = self.text_config.hidden_size
+
+        self.logit_scale_init_value = logit_scale_init_value
+        self.initializer_factor = initializer_factor
+        self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths)
+
+
+__all__ = ["ClapAudioConfig", "ClapConfig", "ClapTextConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/feature_extraction_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/feature_extraction_clap.py
new file mode 100644
index 0000000000000000000000000000000000000000..33daac615c07ab879d8515e04e09d01fe27b37fb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/feature_extraction_clap.py
@@ -0,0 +1,367 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for CLAP."""
+
+import copy
+from typing import Any, Optional, Union
+
+import numpy as np
+import torch
+
+from ...audio_utils import mel_filter_bank, spectrogram, window_function
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import TensorType, logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("torch",))
+class ClapFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a CLAP feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the *Short Time
+    Fourier Transform* (STFT) which should match pytorch's `torch.stft` equivalent.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 64):
+            The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters
+            (`n_mels`).
+        sampling_rate (`int`, *optional*, defaults to 48000):
+            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
+            to warn users if the audio fed to the feature extractor does not have the same sampling rate.
+        hop_length (`int`,*optional*, defaults to 480):
+            Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
+            in smaller `frames` with a step of `hop_length` between each frame.
+        max_length_s (`int`, *optional*, defaults to 10):
+            The maximum input length of the model in seconds. This is used to pad the audio.
+        fft_window_size (`int`, *optional*, defaults to 1024):
+            Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
+            resolution of the spectrogram. 1024 means that the Fourier transform is computed on windows of 1024 samples.
+        padding_value (`float`, *optional*, defaults to 0.0):
+            Padding value used to pad the audio. Should correspond to silences.
+        return_attention_mask (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the attention masks corresponding to the input.
+        frequency_min (`float`, *optional*, defaults to 0):
+            The lowest frequency of interest. The STFT will not be computed for values below this.
+        frequency_max (`float`, *optional*, defaults to 14000):
+            The highest frequency of interest. The STFT will not be computed for values above this.
+        top_db (`float`, *optional*):
+            The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the
+            `audio_utils.power_to_db` function
+        truncation (`str`, *optional*, defaults to `"fusion"`):
+            Truncation pattern for long audio inputs. Two patterns are available:
+                - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a
+                  downsampled version of the entire mel spectrogram.
+            If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a copy
+            of the original mel obtained from the padded audio.
+                - `rand_trunc` will select a random crop of the mel spectrogram.
+        padding (`str`, *optional*, defaults to `"repeatpad"`):
+               Padding pattern for shorter audio inputs. Three patterns were originally implemented:
+                - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`.
+                - `repeat`: the audio is repeated and then cut to fit the `max_length`
+                - `pad`: the audio is padded.
+    """
+
+    model_input_names = ["input_features", "is_longer"]
+
+    def __init__(
+        self,
+        feature_size=64,
+        sampling_rate=48_000,
+        hop_length=480,
+        max_length_s=10,
+        fft_window_size=1024,
+        padding_value=0.0,
+        return_attention_mask=False,  # pad inputs to max length with silence token (zero) and no attention mask
+        frequency_min: float = 0,
+        frequency_max: float = 14_000,
+        top_db: Optional[int] = None,
+        truncation: str = "fusion",
+        padding: str = "repeatpad",
+        **kwargs,
+    ):
+        super().__init__(
+            feature_size=feature_size,
+            sampling_rate=sampling_rate,
+            padding_value=padding_value,
+            return_attention_mask=return_attention_mask,
+            **kwargs,
+        )
+        self.top_db = top_db  # NOTE(review): stored, but no method of this class passes it on to `spectrogram`
+        self.truncation = truncation  # default truncation pattern, used when `__call__` gets `truncation=None`
+        self.padding = padding  # default padding pattern, used when `__call__` gets a falsy `padding`
+        self.fft_window_size = fft_window_size
+        self.nb_frequency_bins = (fft_window_size >> 1) + 1  # fft_window_size // 2 + 1
+        self.hop_length = hop_length
+        self.max_length_s = max_length_s
+        self.nb_max_samples = max_length_s * sampling_rate  # seconds * Hz: default `max_length` in samples
+        self.sampling_rate = sampling_rate
+        self.frequency_min = frequency_min
+        self.frequency_max = frequency_max
+        self.mel_filters = mel_filter_bank(  # HTK mel scale, no normalization; used when truncation is "fusion"
+            num_frequency_bins=self.nb_frequency_bins,
+            num_mel_filters=feature_size,
+            min_frequency=frequency_min,
+            max_frequency=frequency_max,
+            sampling_rate=sampling_rate,
+            norm=None,
+            mel_scale="htk",
+        )
+        self.mel_filters_slaney = mel_filter_bank(  # Slaney scale and norm; used for every other truncation value
+            num_frequency_bins=self.nb_frequency_bins,
+            num_mel_filters=feature_size,
+            min_frequency=frequency_min,
+            max_frequency=frequency_max,
+            sampling_rate=sampling_rate,
+            norm="slaney",
+            mel_scale="slaney",
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Build a plain-dict snapshot of this feature extractor.
+
+        Returns:
+            `dict[str, Any]`: A deep copy of the instance attributes plus a `feature_extractor_type` entry holding
+            the class name. The two mel filter banks are left out because they are too long to save or print.
+        """
+        serialized = copy.deepcopy(self.__dict__)
+        serialized["feature_extractor_type"] = self.__class__.__name__
+        # Remove the filter banks; `pop` with a default is a no-op when an instance
+        # does not carry one of them.
+        for filter_key in ("mel_filters", "mel_filters_slaney"):
+            serialized.pop(filter_key, None)
+        return serialized
+
+    def _np_extract_fbank_features(self, waveform: np.ndarray, mel_filters: Optional[np.ndarray] = None) -> np.ndarray:
+        """
+        Compute the log-mel (dB) power spectrogram of `waveform` with a Hann window and return it transposed.
+
+        The caller picks the filter bank according to the truncation pattern:
+            - `self.mel_filters` (HTK scale, no norm) is used when `truncation` is `"fusion"`; per the original
+              notes these match `torchaudio.transforms.MelSpectrogram().mel_scale.fb`.
+            - `self.mel_filters_slaney` (Slaney scale and norm) is used otherwise; per the original notes these
+              match what `librosa.filters.mel` produces.
+        """
+        hann_window = window_function(self.fft_window_size, "hann")
+        log_mel = spectrogram(
+            waveform,
+            hann_window,
+            frame_length=self.fft_window_size,
+            hop_length=self.hop_length,
+            power=2.0,
+            mel_filters=mel_filters,
+            log_mel="dB",
+        )
+        return log_mel.T
+
+    def _random_mel_fusion(self, mel, total_frames, chunk_frames):
+        """Stack a time-resized copy of `mel` with three random `chunk_frames`-long crops (front, middle, back)."""
+        ranges = np.array_split(np.arange(0, total_frames - chunk_frames + 1), 3)
+        if len(ranges[1]) == 0:
+            # if the audio is too short, we just use the first chunk
+            ranges[1] = [0]
+        if len(ranges[2]) == 0:
+            # if the audio is too short, we just use the first chunk
+            ranges[2] = [0]
+        # randomly choose index for each part
+        idx_front = np.random.choice(ranges[0])
+        idx_middle = np.random.choice(ranges[1])
+        idx_back = np.random.choice(ranges[2])
+        mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :]
+        mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :]
+        mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :]
+        # resize only the frame axis; keep the input's own mel-bin count (was hard-coded to 64) so np.stack lines up
+        mel = torch.tensor(mel[None, None, :])
+        mel_shrink = torch.nn.functional.interpolate(
+            mel, size=[chunk_frames, mel.shape[-1]], mode="bilinear", align_corners=False
+        )
+        mel_shrink = mel_shrink[0][0].numpy()
+        mel_fusion = np.stack([mel_shrink, mel_chunk_front, mel_chunk_middle, mel_chunk_back], axis=0)
+        return mel_fusion
+
+    def _get_input_mel(self, waveform: np.ndarray, max_length, truncation, padding) -> np.ndarray:
+        """
+        Extracts the mel spectrogram and prepares it for the model based on the `truncation` and `padding` arguments.
+        Four different paths are possible:
+            - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram
+              will be computed on the entire audio. 3 random crops and a downsampled version of the full mel spectrogram
+              are then stacked together. They will later be used for `feature_fusion`.
+            - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is
+              padded based on `padding`.
+            - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded
+              based on `padding`, and is repeated `4` times.
+            - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel
+              spectrogram will be computed on a random crop of the waveform.
+        Returns an `(input_mel, longer)` pair; `longer` is True only when the input was randomly cropped or fused.
+        """
+        if waveform.shape[0] > max_length:
+            if truncation == "rand_trunc":
+                longer = True
+                # random crop to max_length (for compatibility) -> this should be handled by self.pad
+                overflow = len(waveform) - max_length
+                idx = np.random.randint(0, overflow + 1)  # crop start, drawn from [0, overflow]
+                waveform = waveform[idx : idx + max_length]
+                input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :]
+            elif truncation == "fusion":
+                mel = self._np_extract_fbank_features(waveform, self.mel_filters)
+                chunk_frames = max_length // self.hop_length + 1  # the +1 related to how the spectrogram is computed
+                total_frames = mel.shape[0]  # axis 0 of the transposed spectrogram is the frame axis
+                if chunk_frames == total_frames:
+                    # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_length.
+                    # In this case, we just use the whole audio.
+                    input_mel = np.stack([mel, mel, mel, mel], axis=0)
+                    longer = False
+                else:
+                    input_mel = self._random_mel_fusion(mel, total_frames, chunk_frames)
+                    longer = True
+            else:
+                raise NotImplementedError(f"data_truncating {truncation} not implemented")
+
+        else:
+            longer = False
+            # only use repeat as a new possible value for padding. you repeat the audio before applying the usual max_length padding
+            if waveform.shape[0] < max_length:
+                if padding == "repeat":
+                    n_repeat = int(max_length / len(waveform))  # whole repetitions that fit in max_length
+                    waveform = np.tile(waveform, n_repeat + 1)[:max_length]  # one extra copy, then cut to max_length
+                if padding == "repeatpad":
+                    n_repeat = int(max_length / len(waveform))
+                    waveform = np.tile(waveform, n_repeat)  # whole copies only; the remainder is zero-padded below
+                waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0)
+
+            if truncation == "fusion":
+                input_mel = self._np_extract_fbank_features(waveform, self.mel_filters)
+                input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0)  # four identical copies
+            else:
+                input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :]
+
+        return input_mel, longer  # NOTE(review): a 2-tuple, although the signature is annotated `-> np.ndarray`
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        truncation: Optional[str] = None,
+        padding: Optional[str] = None,
+        max_length: Optional[int] = None,
+        sampling_rate: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
+            truncation (`str`, *optional*):
+                Truncation pattern for long audio inputs. Two patterns are available:
+                    - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and
+                      a downsampled version of the entire mel spectrogram.
+                If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a
+                copy of the original mel obtained from the padded audio.
+                    - `rand_trunc` will select a random crop of the mel spectrogram.
+            padding (`str`, *optional*):
+               Padding pattern for shorter audio inputs. Three patterns were originally implemented:
+                    - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`.
+                    - `repeat`: the audio is repeated and then cut to fit the `max_length`
+                    - `pad`: the audio is padded.
+            max_length (`int`, *optional*):
+                Target length of each waveform in samples; `self.nb_max_samples` is used when it is unset (or 0).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. Passing it is strongly recommended: a
+                value different from the extractor's `sampling_rate` raises a `ValueError` instead of failing silently.
+        """
+        truncation = truncation if truncation is not None else self.truncation
+        padding = padding if padding else self.padding  # any falsy value (None, "") falls back to `self.padding`
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
+                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
+                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1  # 2-D array: one row per waveform
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        is_batched = is_batched_numpy or (
+            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_speech = [np.asarray(speech, dtype=np.float64) for speech in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech, dtype=np.float64)
+        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+            raw_speech = raw_speech.astype(np.float64)  # NOTE(review): already float64 here, so this only copies
+
+        # always return batch
+        if not is_batched:
+            raw_speech = [np.asarray(raw_speech)]
+
+        # convert to mel spectrogram, truncate and pad if needed.
+        padded_inputs = [
+            self._get_input_mel(waveform, max_length if max_length else self.nb_max_samples, truncation, padding)
+            for waveform in raw_speech
+        ]
+
+        input_mel = []
+        is_longer = []
+        for mel, longer in padded_inputs:  # split the (mel, longer) pairs into two parallel lists
+            input_mel.append(mel)
+            is_longer.append(longer)
+
+        if truncation == "fusion" and sum(is_longer) == 0:
+            # if no audio is longer than 10s, then randomly select one audio to be longer
+            rand_idx = np.random.randint(0, len(input_mel))
+            is_longer[rand_idx] = True
+
+        if isinstance(input_mel[0], list):  # NOTE(review): `_get_input_mel` builds arrays, so this looks unreachable
+            input_mel = [np.asarray(feature, dtype=np.float64) for feature in input_mel]
+
+        # is_longer is a list of bool
+        is_longer = [[longer] for longer in is_longer]  # wrap each flag in its own single-element list
+
+        input_features = {"input_features": input_mel, "is_longer": is_longer}
+        input_features = BatchFeature(input_features)
+
+        if return_tensors is not None:
+            input_features = input_features.convert_to_tensors(return_tensors)
+
+        return input_features
+
+
+__all__ = ["ClapFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d81a26581dd35dcfcc06b0f1881640acd47a070
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.py
@@ -0,0 +1,1929 @@
+# coding=utf-8
+# Copyright 2023 The LAION-AI Team and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CLAP model."""
+
+import collections
+import math
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs, logging, torch_int
+from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Adapted from: https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191
+def interpolate(hidden_states, ratio):
+    """
+    Upsample along the time axis by repeating every time step `ratio` times (undoes the downsampling of a CNN).
+
+    Args:
+        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
+            Tensor to upsample.
+        ratio (`int`):
+            Number of consecutive copies of each time step; the output has `time_length * ratio` time steps.
+    """
+    batch, steps, classes = hidden_states.shape
+    # add a singleton axis right after time, tile it `ratio` times, then fold it back into the time axis
+    tiled = hidden_states.unsqueeze(2).repeat(1, 1, ratio, 1)
+    return tiled.reshape(batch, steps * ratio, classes)
+
+
+# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249
+def window_partition(hidden_states, window_size):
+    """
+    Cut a feature map into non-overlapping square windows. The result has shape
+    `(batch_size * num_windows, window_size, window_size, num_channels)`.
+
+    Args:
+        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
+            Feature map to split; the `view` below requires `height` and `width` to be multiples of `window_size`.
+        window_size (`int`):
+            Side length of each window.
+    """
+    batch_size, height, width, num_channels = hidden_states.shape
+    rows, cols = height // window_size, width // window_size
+
+    # split each spatial axis into (window index, offset inside the window), then group the two window indices
+    grid = hidden_states.view(batch_size, rows, window_size, cols, window_size, num_channels)
+    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
+    return grid.view(-1, window_size, window_size, num_channels)
+
+
+# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263
+def window_reverse(windows, window_size, height, width):
+    """
+    Stitch square windows back into feature maps of shape `(batch_size, height, width, num_channels)`.
+    Args:
+        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
+            Windows to merge.
+        window_size (`int`):
+            Side length of each window.
+        height (`int`):
+            Height of the merged feature map.
+        width (`int`):
+            Width of the merged feature map.
+    """
+    rows, cols, channels = height // window_size, width // window_size, windows.shape[-1]
+    grid = windows.view(-1, rows, cols, window_size, window_size, channels)
+    # put every in-window row offset next to its window row (same for columns) before flattening to height x width
+    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, channels)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        input_ids (`torch.Tensor`): token ids; positions are accumulated along dim 1.
+        padding_idx / past_key_values_length: id that marks padding / offset added to every non-padding position.
+    Returns: `torch.Tensor` (int64) of position ids; entries at padding positions are equal to `padding_idx`.
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()  # 1 for real tokens, 0 for padding
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+# contrastive loss function, adapted from
+# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html#CLIP-loss-function
+def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
+    targets = torch.arange(len(logits), device=logits.device)  # row i is expected to score highest at column i
+    return F.cross_entropy(logits, targets)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for text model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap
+class ClapTextModelOutput(ModelOutput):
+    r"""
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The text embeddings obtained by applying the projection layer to the pooler_output.
+    """
+    # Every field is optional and defaults to None.
+    text_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    ClapAudio model output to mimic the output of the original implementation.
+    """
+)
+class ClapAudioModelOutput(ModelOutput):
+    r"""
+    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
+        The Audio embeddings obtained by applying the projection layer to the pooler_output.
+    """
+    # Every field is optional and defaults to None.
+    audio_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring
+# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio
+class ClapOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+        Contrastive loss for audio-text similarity.
+    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
+        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
+        similarity scores.
+    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
+        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
+        similarity scores.
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
+        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
+    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
+        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
+    text_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`ClapTextModel`].
+    audio_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`ClapAudioModel`].
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits_per_audio: Optional[torch.FloatTensor] = None
+    logits_per_text: Optional[torch.FloatTensor] = None
+    text_embeds: Optional[torch.FloatTensor] = None
+    audio_embeds: Optional[torch.FloatTensor] = None
+    text_model_output: BaseModelOutputWithPooling = None
+    audio_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(  # the two nested model outputs are converted with their own `to_tuple()`; other keys as stored
+            self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+# Adapted from transformers.models.swin.modeling_swin.SwinDropPath
+class ClapDropPath(nn.Module):
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). `drop_prob` is the
+    probability of zeroing a whole sample; `None` (the default) and `0.0` both disable dropping.
+    """
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states):
+        # `not drop_prob` covers None as well as 0.0; None used to pass the `== 0.0` test and fail on `1 - None`
+        if not self.drop_prob or not self.training:
+            return hidden_states
+        keep_prob = 1 - self.drop_prob
+        # work with diff dim tensors, not just 2D ConvNets
+        shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1)
+        # keep_prob + U[0, 1) floors to 1 with probability keep_prob and to 0 otherwise (one draw per sample)
+        random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device)
+        random_tensor.floor_()  # binarize
+        output = hidden_states.div(keep_prob) * random_tensor
+        return output
+
+
+# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133
+class ClapAudioAFFBlock(nn.Module):
+    r"""
+    Attentional feature fusion (AFF) block of CLAP. Only the 2D variant is implemented, since CLAP always works in
+    2D mode.
+    """
+
+    def __init__(self, config: ClapAudioConfig):
+        super().__init__()
+        channels = config.patch_embeds_hidden_size
+        inter_channels = int(channels // config.aff_block_r)
+
+        def bottleneck():
+            # 1x1 conv down to `inter_channels`, BN, ReLU, 1x1 conv back up to `channels`, BN
+            return [
+                nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
+                nn.BatchNorm2d(inter_channels),
+                nn.ReLU(inplace=True),
+                nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
+                nn.BatchNorm2d(channels),
+            ]
+
+        # Both branches share the bottleneck layout; the global branch first average-pools its input to 1x1.
+        # Layers are created in the same order as before, so sub-module indices and parameter init order are kept.
+        self.local_att = nn.Sequential(*bottleneck())
+        self.global_att = nn.Sequential(nn.AdaptiveAvgPool2d(1), *bottleneck())
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, hidden_states, residual):
+        """
+        Blend `hidden_states` and `residual` with a sigmoid gate computed from their sum.
+        """
+        summed = hidden_states + residual
+
+        # outputs of the local and the pooled (global) branch are added before the sigmoid
+        gate = self.sigmoid(self.local_att(summed) + self.global_att(summed))
+
+        # complementary weights: `gate` for hidden_states, `1 - gate` for residual, both scaled by 2
+        return 2 * hidden_states * gate + 2 * residual * (1 - gate)
+
+
+class ClapAudioPatchEmbed(nn.Module):
+    """
+    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
+    Transformer block.
+    """
+
+    def __init__(self, config: ClapAudioConfig):
+        super().__init__()
+        img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size
+        patch_size = (
+            (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size
+        )
+        patch_stride = (
+            (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride
+        )
+
+        self.img_size = img_size
+        self.patch_stride = patch_stride
+
+        self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1])
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+
+        self.flatten = config.flatten_patch_embeds
+        self.enable_fusion = config.enable_fusion
+
+        padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2)
+
+        scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1
+
+        self.proj = nn.Conv2d(
+            config.patch_embed_input_channels * scale_factor,
+            config.patch_embeds_hidden_size,
+            kernel_size=patch_size,
+            stride=patch_stride,
+            padding=padding,
+        )
+
+        self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity()
+        if self.enable_fusion:
+            self.fusion_model = ClapAudioAFFBlock(config)
+            self.mel_conv2d = nn.Conv2d(
+                config.patch_embed_input_channels,
+                config.patch_embeds_hidden_size,
+                kernel_size=(patch_size[0], patch_size[1] * 3),
+                stride=(patch_stride[0], patch_stride[1] * 3),
+                padding=padding,
+            )
+
+    def forward(self, hidden_states, is_longer_idx=None):
+        if self.enable_fusion:
+            # retrieve the last mel as we have transposed the input
+            global_hidden_states = hidden_states[:, 0:1, :, :]
+
+            # global processing
+            batch_size, num_channels, height, width = global_hidden_states.shape
+
+            if height != self.img_size[0] or width != self.img_size[1]:
+                raise ValueError(
+                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+                )
+
+            global_hidden_states = self.proj(global_hidden_states)
+            output_width = global_hidden_states.size(-1)
+            if len(is_longer_idx) > 0:
+                # local processing
+                local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous()
+                batch_size, num_channels, height, width = local_hidden_states.shape
+                local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width)
+
+                local_hidden_states = self.mel_conv2d(local_hidden_states)
+
+                _, features, height, width = local_hidden_states.shape
+                local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width)
+                local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3)
+
+                local_width = local_hidden_states.size(-1)
+                local_hidden_states = torch.nn.functional.pad(
+                    local_hidden_states, (0, output_width - local_width), "constant", 0
+                )
+
+                global_hidden_states[is_longer_idx] = self.fusion_model(
+                    global_hidden_states[is_longer_idx], local_hidden_states
+                )
+            hidden_states = global_hidden_states
+        else:
+            _, _, height, width = hidden_states.shape
+            if height != self.img_size[0] or width != self.img_size[1]:
+                raise ValueError(
+                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+                )
+            hidden_states = self.proj(hidden_states)
+
+        if self.flatten:
+            hidden_states = hidden_states.flatten(2).transpose(1, 2)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->ClapAudio
+class ClapAudioSelfAttention(nn.Module):
+    def __init__(self, config, dim, num_heads, window_size):
+        super().__init__()
+        if dim % num_heads != 0:
+            raise ValueError(
+                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+            )
+
+        self.num_attention_heads = num_heads
+        self.attention_head_size = int(dim / num_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.window_size = (
+            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
+        )
+
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
+        )
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
+        coords_flatten = torch.flatten(coords, 1)
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        batch_size, dim, num_channels = hidden_states.shape
+        hidden_shape = (batch_size, dim, -1, self.attention_head_size)
+
+        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
+        relative_position_bias = relative_position_bias.view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
+        )
+
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
+        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)
+
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in ClapAudioModel forward() function)
+            mask_shape = attention_mask.shape[0]
+            attention_scores = attention_scores.view(
+                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
+            )
+            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
+            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio
+class ClapAudioSelfOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->ClapAudio
+class ClapAudioAttention(nn.Module):
+    def __init__(self, config, dim, num_heads, window_size):
+        super().__init__()
+        self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size)
+        self.output = ClapAudioSelfOutput(config, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->ClapAudio
+class ClapAudioIntermediate(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinOutput with Swin->ClapAudio
+class ClapAudioOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio
+class ClapAudioLayer(nn.Module):
+    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.shift_size = shift_size
+        self.window_size = config.window_size
+        self.input_resolution = input_resolution
+        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
+        self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.intermediate = ClapAudioIntermediate(config, dim)
+        self.output = ClapAudioOutput(config, dim)
+
+    def set_shift_and_window_size(self, input_resolution):
+        if min(input_resolution) <= self.window_size:
+            # if window size is larger than input resolution, we don't partition windows
+            self.shift_size = torch_int(0)
+            self.window_size = (
+                torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution)
+            )
+
+    def get_attn_mask(self, height, width, dtype, device):
+        if self.shift_size > 0:
+            # calculate attention mask for SW-MSA
+            img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
+            height_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            width_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None),
+            )
+            count = 0
+            for height_slice in height_slices:
+                for width_slice in width_slices:
+                    img_mask[:, height_slice, width_slice, :] = count
+                    count += 1
+
+            mask_windows = window_partition(img_mask, self.window_size)
+            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
+        else:
+            attn_mask = None
+        return attn_mask
+
+    def maybe_pad(self, hidden_states, height, width):
+        pad_right = (self.window_size - width % self.window_size) % self.window_size
+        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
+        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
+        hidden_states = nn.functional.pad(hidden_states, pad_values)
+        return hidden_states, pad_values
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        input_dimensions: tuple[int, int],
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        always_partition: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if not always_partition:
+            self.set_shift_and_window_size(input_dimensions)
+        else:
+            pass
+        height, width = input_dimensions
+        batch_size, _, channels = hidden_states.size()
+        shortcut = hidden_states
+
+        hidden_states = self.layernorm_before(hidden_states)
+
+        hidden_states = hidden_states.view(batch_size, height, width, channels)
+
+        # pad hidden_states to multiples of window size
+        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
+
+        _, height_pad, width_pad, _ = hidden_states.shape
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+        else:
+            shifted_hidden_states = hidden_states
+
+        # partition windows
+        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
+        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
+        attn_mask = self.get_attn_mask(
+            height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device
+        )
+
+        attention_outputs = self.attention(
+            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
+        )
+
+        attention_output = attention_outputs[0]
+
+        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
+        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            attention_windows = shifted_windows
+
+        was_padded = pad_values[3] > 0 or pad_values[5] > 0
+        if was_padded:
+            attention_windows = attention_windows[:, :height, :width, :].contiguous()
+
+        attention_windows = attention_windows.view(batch_size, height * width, channels)
+
+        hidden_states = shortcut + self.drop_path(attention_windows)
+
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+        layer_output = hidden_states + self.output(layer_output)
+
+        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+        return layer_outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->ClapAudio
+class ClapAudioStage(GradientCheckpointingLayer):
+    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
+        super().__init__()
+        self.config = config
+        self.dim = dim
+        self.blocks = nn.ModuleList(
+            [
+                ClapAudioLayer(
+                    config=config,
+                    dim=dim,
+                    input_resolution=input_resolution,
+                    num_heads=num_heads,
+                    drop_path_rate=drop_path[i],
+                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
+                )
+                for i in range(depth)
+            ]
+        )
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
+        else:
+            self.downsample = None
+
+        self.pointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        input_dimensions: tuple[int, int],
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        always_partition: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        height, width = input_dimensions
+        for i, layer_module in enumerate(self.blocks):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
+            )
+
+            hidden_states = layer_outputs[0]
+
+        hidden_states_before_downsampling = hidden_states
+        if self.downsample is not None:
+            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
+            output_dimensions = (height, width, height_downsampled, width_downsampled)
+            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
+        else:
+            output_dimensions = (height, width, height, width)
+
+        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)
+
+        if output_attentions:
+            stage_outputs += layer_outputs[1:]
+        return stage_outputs
+
+
+# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->ClapAudio
+class ClapAudioPatchMerging(nn.Module):
+    """
+    Patch Merging Layer.
+
+    Args:
+        input_resolution (`tuple[int]`):
+            Resolution of input feature.
+        dim (`int`):
+            Number of input channels.
+        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer class.
+    """
+
+    def __init__(self, input_resolution: tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+        self.norm = norm_layer(4 * dim)
+
+    def maybe_pad(self, input_feature, height, width):
+        should_pad = (height % 2 == 1) or (width % 2 == 1)
+        if should_pad:
+            pad_values = (0, 0, 0, width % 2, 0, height % 2)
+            input_feature = nn.functional.pad(input_feature, pad_values)
+
+        return input_feature
+
+    def forward(self, input_feature: torch.Tensor, input_dimensions: tuple[int, int]) -> torch.Tensor:
+        height, width = input_dimensions
+        # `dim` is height * width
+        batch_size, dim, num_channels = input_feature.shape
+
+        input_feature = input_feature.view(batch_size, height, width, num_channels)
+        # pad input to be divisible by width and height, if needed
+        input_feature = self.maybe_pad(input_feature, height, width)
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_0 = input_feature[:, 0::2, 0::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_1 = input_feature[:, 1::2, 0::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_2 = input_feature[:, 0::2, 1::2, :]
+        # [batch_size, height/2, width/2, num_channels]
+        input_feature_3 = input_feature[:, 1::2, 1::2, :]
+        # batch_size height/2 width/2 4*num_channels
+        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
+        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # batch_size height/2*width/2 4*C
+
+        input_feature = self.norm(input_feature)
+        input_feature = self.reduction(input_feature)
+
+        return input_feature
+
+
+class ClapAudioEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_layers = len(config.depths)
+
+        self.config = config
+        self.patch_embed = ClapAudioPatchEmbed(config)
+        self.enable_fusion = config.enable_fusion
+        self.patch_stride = self.patch_embed.patch_stride
+        self.spec_size = config.spec_size
+        self.freq_ratio = config.spec_size // config.num_mel_bins
+
+        self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1))
+
+        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
+
+        grid_size = self.patch_embed.grid_size
+        self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)]
+
+        self.layers = nn.ModuleList(
+            [
+                ClapAudioStage(
+                    config=config,
+                    dim=int(config.patch_embeds_hidden_size * 2**i_layer),
+                    input_resolution=self.input_resolutions[i_layer],
+                    depth=config.depths[i_layer],
+                    num_heads=config.num_attention_heads[i_layer],
+                    drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+                    downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None,
+                )
+                for i_layer in range(self.num_layers)
+            ]
+        )
+
+        self.gradient_checkpointing = False
+
+        self.batch_norm = nn.BatchNorm2d(config.num_mel_bins)
+        self.norm = nn.LayerNorm(self.num_features)
+        self.depths = config.depths
+        self.avgpool = nn.AdaptiveAvgPool1d(1)
+
+    def reshape_mel2img(self, normalized_input_features):
+        """
+        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
+        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
+        """
+        _, _, time_length, freq_length = normalized_input_features.shape
+
+        spec_width = int(self.spec_size * self.freq_ratio)
+        spec_height = self.spec_size // self.freq_ratio
+
+        if time_length > spec_width or freq_length > spec_height:
+            raise ValueError("the wav size should be less than or equal to the swin input size")
+
+        # to avoid bicubic zero error
+        if time_length < spec_width:
+            normalized_input_features = nn.functional.interpolate(
+                normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True
+            )
+        if freq_length < spec_height:
+            normalized_input_features = nn.functional.interpolate(
+                normalized_input_features, (time_length, spec_height), mode="bicubic", align_corners=True
+            )
+
+        batch, channels, time, freq = normalized_input_features.shape
+
+        # batch_size, channels, spec_width, spec_height --> batch_size, channels, spec_height * freq_ratio, spec_width // freq_ratio
+        normalized_input_features = normalized_input_features.reshape(
+            batch, channels * self.freq_ratio, time // self.freq_ratio, freq
+        )
+        normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous()
+        normalized_input_features = normalized_input_features.reshape(
+            batch, channels, freq * self.freq_ratio, time // self.freq_ratio
+        )
+
+        return normalized_input_features
+
+    def forward(
+        self,
+        input_features,
+        is_longer: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        output_hidden_states_before_downsampling: Optional[bool] = False,
+        always_partition: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple, ClapAudioModelOutput]:
+        input_features = input_features.transpose(1, 3)
+        normalized_input_features = self.batch_norm(input_features)
+        normalized_input_features = normalized_input_features.transpose(1, 3)
+
+        is_longer_list_idx = None
+        if self.enable_fusion:
+            is_longer_list = is_longer.to(input_features.device)
+            is_longer_list_idx = torch.where(is_longer_list == 1)[0]
+
+        hidden_states = self.reshape_mel2img(normalized_input_features)
+
+        frames_num = hidden_states.shape[2]
+
+        hidden_states = self.patch_embed(hidden_states, is_longer_list_idx)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        input_dimensions = self.input_resolutions[0]
+
+        if output_hidden_states:
+            batch_size, _, hidden_size = hidden_states.shape
+            # rearrange batch_size (height width) channels -> batch_size channel height width
+            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
+            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+            all_hidden_states += (hidden_states,)
+            all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+        for i, layer_module in enumerate(self.layers):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            input_dimensions = self.input_resolutions[i]
+
+            layer_outputs = layer_module(
+                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
+            )
+
+            hidden_states = layer_outputs[0]
+
+            hidden_states_before_downsampling = layer_outputs[1]
+            output_dimensions = layer_outputs[2]
+
+            input_dimensions = (output_dimensions[-2], output_dimensions[-1])
+
+            if output_hidden_states and output_hidden_states_before_downsampling:
+                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
+                # rearrange batch_size (height width) channels -> batch_size channel height width
+                # here we use the original (not downsampled) height and width
+                reshaped_hidden_state = hidden_states_before_downsampling.view(
+                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
+                )
+                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states_before_downsampling,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+            elif output_hidden_states and not output_hidden_states_before_downsampling:
+                batch_size, _, hidden_size = hidden_states.shape
+                # rearrange batch_size (height width) channels -> batch_size channel height width
+                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
+                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+            if output_attentions:
+                all_self_attentions += layer_outputs[3:]
+
+        last_hidden_state = self.norm(hidden_states)
+
+        batch_size, _, n_channels = last_hidden_state.shape
+
+        freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0]
+        temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1]
+
+        last_hidden_state = (
+            last_hidden_state.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape)
+        )
+
+        batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape
+        # group 2D CNN
+        c_freq_bin = n_frequencies // self.freq_ratio
+        last_hidden_state = last_hidden_state.reshape(
+            batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp
+        )
+        last_hidden_state = (
+            last_hidden_state.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1)
+        )
+        latent_output = self.avgpool(torch.flatten(last_hidden_state, 2))
+        latent_output = torch.flatten(latent_output, 1)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    last_hidden_state,
+                    latent_output,
+                    all_reshaped_hidden_states,
+                    all_self_attentions,
+                ]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=latent_output,
+            hidden_states=all_reshaped_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class ClapProjectionLayer(nn.Module):
+    """Two-layer projection head: `linear1` -> configured activation -> `linear2`."""
+
+    def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]):
+        super().__init__()
+        self.config = config
+        out_features = config.projection_dim
+        # linear1 maps hidden_size -> projection_dim; linear2 keeps projection_dim. Creation order is unchanged.
+        self.linear1 = nn.Linear(config.hidden_size, out_features)
+        self.activation = ACT2FN[config.projection_hidden_act]
+        self.linear2 = nn.Linear(out_features, out_features)
+
+    def forward(self, hidden_states):
+        """Return `linear2(activation(linear1(hidden_states)))`."""
+        projected = self.activation(self.linear1(hidden_states))
+        return self.linear2(projected)
+
+
+# Adapted from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText, persistent=False->persistent=True
+class ClapTextEmbeddings(nn.Module):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing: the position table is built with
+    `padding_idx` and position ids generated from embeddings start at `padding_idx + 1`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        # Built exactly once and directly with `padding_idx` (no throw-away table). It is registered between the
+        # word and token-type embeddings, the slot this attribute has always occupied, so submodule and state-dict
+        # ordering stay compatible with existing checkpoints.
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        # Both buffers have shape (1, max_position_embeddings) and are persistent (saved with the state dict).
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True
+        )
+
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        """
+        Sum word, token-type and (for "absolute" position embeddings) position embeddings, then apply LayerNorm
+        and dropout.
+
+        Args:
+            input_ids: token ids; looked up in `word_embeddings` when `inputs_embeds` is None.
+            token_type_ids: segment ids; defaults to a slice of the registered `token_type_ids` buffer.
+            position_ids: explicit position ids; derived from the inputs when None.
+            inputs_embeds: pre-computed word embeddings used instead of looking up `input_ids`.
+            past_key_values_length: forwarded to `create_position_ids_from_input_ids`.
+
+        Returns: the embedding tensor produced by the dropout layer.
+        """
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        input_shape = input_ids.size() if input_ids is not None else inputs_embeds.size()[:-1]
+        seq_length = input_shape[1]
+
+        # Fall back to the registered buffer so the model can be traced without token_type_ids (issue #5664).
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                token_type_ids = self.token_type_ids[:, :seq_length].expand(input_shape[0], seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
+        if self.position_embedding_type == "absolute":
+            embeddings += self.position_embeddings(position_ids)
+        return self.dropout(self.LayerNorm(embeddings))
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor of shape `inputs_embeds.size()[:-1]`, counting up from `padding_idx + 1`
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+# Copied from transformers.models.align.modeling_align.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    """Scaled dot-product attention in plain PyTorch; returns `(attn_output, attn_weights)`.
+
+    `attention_mask`, when given, is cropped to the key length and added to the scores before the softmax;
+    `head_mask`, when given, is viewed as (1, -1, 1, 1) and multiplied into the post-dropout weights.
+    """
+    scores = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask[:, :, :, : key.shape[-2]]
+    # Softmax runs in float32, then the result is cast back to the query dtype.
+    attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+    return torch.matmul(attn_weights, value).transpose(1, 2).contiguous(), attn_weights
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextSelfAttention with Align->Clap
+class ClapTextSelfAttention(nn.Module):
+    """Multi-head self-attention: query/key/value projections feeding a selectable attention function."""
+
+    def __init__(self, config):
+        super().__init__()
+        if not (config.hidden_size % config.num_attention_heads == 0 or hasattr(config, "embedding_size")):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.attention_dropout = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor]:
+        """Return `(attn_output,)`, or `(attn_output, attn_weights)` when `output_attentions` is truthy."""
+        input_shape = hidden_states.shape[:-1]
+
+        def split_heads(projected: torch.Tensor) -> torch.Tensor:
+            # View the last dim as (heads, head_size), then swap dims 1 and 2.
+            return projected.view(*input_shape, -1, self.attention_head_size).transpose(1, 2)
+
+        if self.config._attn_implementation == "eager":
+            attention_interface: Callable = eager_attention_forward
+        else:
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            split_heads(self.query(hidden_states)),
+            split_heads(self.key(hidden_states)),
+            split_heads(self.value(hidden_states)),
+            attention_mask,
+            dropout=self.attention_dropout if self.training else 0.0,
+            scaling=self.scaling,
+            head_mask=head_mask,
+            **kwargs,
+        )
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        return (attn_output, attn_weights) if output_attentions else (attn_output,)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
+class ClapTextSelfOutput(nn.Module):
+    """Dense projection and dropout, then a residual add with `input_tensor` followed by LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        projected = self.dropout(self.dense(hidden_states))
+        return self.LayerNorm(projected + input_tensor)
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextAttention with Align->Clap
+class ClapTextAttention(nn.Module):
+    """Wraps the self-attention module (`self`) and its output block (`output`); supports head pruning."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self = ClapTextSelfAttention(config)
+        self.output = ClapTextSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """Prune `heads` from the q/k/v projections and the output dense layer, then update the head bookkeeping."""
+        if not len(heads):
+            return
+        attn = self.self
+        heads, index = find_pruneable_heads_and_indices(
+            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
+        )
+        attn.query = prune_linear_layer(attn.query, index)
+        attn.key = prune_linear_layer(attn.key, index)
+        attn.value = prune_linear_layer(attn.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+        attn.num_attention_heads = attn.num_attention_heads - len(heads)
+        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor]:
+        """Return `(attention_output,)` followed by any extra items produced by `self.self`."""
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        return (attention_output,) + self_outputs[1:]  # add attentions if we output them
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
+class ClapTextIntermediate(nn.Module):
+    """Linear(hidden_size -> intermediate_size) followed by the configured activation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        # `hidden_act` is either an ACT2FN key (str) or something to call directly.
+        act = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[act] if isinstance(act, str) else act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Project `hidden_states` with `dense`, then apply the activation."""
+        return self.intermediate_act_fn(self.dense(hidden_states))
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput
+class ClapTextOutput(nn.Module):
+    """Linear(intermediate_size -> hidden_size) and dropout, then residual add with `input_tensor` and LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        residual_sum = self.dropout(self.dense(hidden_states)) + input_tensor
+        return self.LayerNorm(residual_sum)
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextLayer with Align->Clap
+class ClapTextLayer(GradientCheckpointingLayer):
+    """One text-encoder layer: attention, then feed-forward run through `apply_chunking_to_forward`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = ClapTextAttention(config)
+        self.intermediate = ClapTextIntermediate(config)
+        self.output = ClapTextOutput(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[torch.Tensor]:
+        """Return `(layer_output,)` followed by every item of the attention output after its first one."""
+        attention_results = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_results[0]
+        )
+        # add self attentions if we output attention weights
+        return (layer_output,) + attention_results[1:]
+
+    def feed_forward_chunk(self, attention_output):
+        """Feed-forward block: `intermediate`, then `output` with `attention_output` as its second argument."""
+        expanded = self.intermediate(attention_output)
+        return self.output(expanded, attention_output)
+
+
+# Copied from transformers.models.align.modeling_align.AlignTextEncoder with Align->Clap
+class ClapTextEncoder(nn.Module):
+    """Sequential stack of `config.num_hidden_layers` `ClapTextLayer` modules."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
+        """
+        Run every layer in order. `hidden_states` collects each layer's input plus the final output, and
+        `attentions` collects item 1 of each layer's output; either is None when not requested.
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for idx, block in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = block(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=None if head_mask is None else head_mask[idx],
+                output_attentions=output_attentions,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions += (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class ClapTextPooler(nn.Module):
+    """Pools a sequence by passing the hidden state at index 0 of dim 1 through Linear + Tanh."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token = hidden_states[:, 0]
+        return self.activation(self.dense(first_token))
+
+
+@auto_docstring
+class ClapPreTrainedModel(PreTrainedModel):
+    config: ClapConfig
+    base_model_prefix = "clap"
+    supports_gradient_checkpointing = False
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights"""
+        factor = self.config.initializer_factor
+        # Dispatch on module type; at most one branch runs per call.
+        if isinstance(module, ClapTextEmbeddings):
+            for table in (module.position_embeddings, module.token_type_embeddings):
+                table.weight.data.normal_(mean=0.0, std=factor * 0.02)
+        elif isinstance(module, ClapModel):
+            module.logit_scale_a.data.fill_(math.log(self.config.logit_scale_init_value))
+            module.logit_scale_t.data.fill_(math.log(self.config.logit_scale_init_value))
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=factor * 0.02)
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, (nn.Conv2d, nn.Linear)):
+            in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor
+            nn.init.normal_(module.weight, std=in_proj_std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, ClapAudioSelfAttention):
+            module.relative_position_bias_table.data.zero_()
+
+
+class ClapAudioModel(ClapPreTrainedModel):
+    """Standalone audio tower: builds a `ClapAudioEncoder` and forwards every call to it."""
+
+    config: ClapAudioConfig
+    main_input_name = "input_features"
+
+    def __init__(self, config: ClapAudioConfig):
+        super().__init__(config)
+        self.audio_encoder = ClapAudioEncoder(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.audio_encoder.patch_embed.proj
+
+    @auto_docstring
+    def forward(
+        self,
+        input_features: Optional[torch.FloatTensor] = None,
+        is_longer: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
+            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
+            the features.
+
+        Examples:
+
+        ```python
+        >>> from datasets import load_dataset
+        >>> from transformers import AutoProcessor, ClapAudioModel
+
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+        >>> audio_sample = dataset["train"]["audio"][0]["array"]
+
+        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
+        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")
+
+        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        cfg = self.config
+        return_dict = cfg.use_return_dict if return_dict is None else return_dict
+        output_attentions = cfg.output_attentions if output_attentions is None else output_attentions
+        output_hidden_states = cfg.output_hidden_states if output_hidden_states is None else output_hidden_states
+        return self.audio_encoder(
+            input_features=input_features,
+            is_longer=is_longer,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
+    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+
+    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
+    """
+)
+class ClapTextModel(ClapPreTrainedModel):
+    config: ClapTextConfig
+
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = ClapTextEmbeddings(config)
+        self.encoder = ClapTextEncoder(config)
+
+        self.pooler = ClapTextPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
+        # NOTE: `return_dict` is not read in this body; tuple conversion is presumably left to `@can_return_tuple`.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        # Exactly one of `input_ids` / `inputs_embeds` has to be provided.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        # Default mask: all ones over (batch_size, seq_length).
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length)), device=device)
+
+        # Default segment ids: reuse the embeddings' registered buffer when it exists, otherwise allocate zeros.
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class ClapModel(ClapPreTrainedModel):
+    config: ClapConfig
+
+    def __init__(self, config: ClapConfig):
+        # Validate the sub-config types, then build the logit scales, both towers and their projection heads.
+        super().__init__(config)
+        text_config = config.text_config
+        audio_config = config.audio_config
+
+        if not isinstance(text_config, ClapTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type ClapTextConfig but is of type"
+                f" {type(text_config)}."
+            )
+        if not isinstance(audio_config, ClapAudioConfig):
+            raise TypeError(
+                "config.audio_config is expected to be of type ClapAudioConfig but is of type"
+                f" {type(audio_config)}."
+            )
+
+        # Both logit scales are stored in log space and start from the same configured value.
+        initial_log_scale = math.log(config.logit_scale_init_value)
+        self.logit_scale_a = nn.Parameter(torch.tensor(initial_log_scale))
+        self.logit_scale_t = nn.Parameter(torch.tensor(initial_log_scale))
+
+        self.projection_dim = config.projection_dim
+
+        self.text_model = ClapTextModel(text_config)
+        self.text_projection = ClapProjectionLayer(text_config)
+        self.audio_model = ClapAudioModel(audio_config)
+        self.audio_projection = ClapProjectionLayer(audio_config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @filter_out_non_signature_kwargs()
+    @auto_docstring
+    def get_text_features(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`ClapTextModel`].
+
+        Examples:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, ClapModel
+
+        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
+        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
+        >>> with torch.inference_mode():
+        ...     text_features = model.get_text_features(**inputs)
+        ```"""
+        pooled_text = self.text_model(
+            input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids
+        ).pooler_output
+        # Project the pooled output, then normalise it along the last dimension.
+        projected = self.text_projection(pooled_text)
+
+        return F.normalize(projected, dim=-1)
+
+    @filter_out_non_signature_kwargs()
+    @auto_docstring
+    def get_audio_features(
+        self,
+        input_features: torch.Tensor,
+        is_longer: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
+            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
+            the features.
+
+        Returns:
+            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by
+            applying the projection layer to the pooled output of [`ClapAudioModel`].
+
+        Examples:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoFeatureExtractor, ClapModel
+
+        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
+        >>> random_audio = torch.rand((16_000))
+
+        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
+        >>> with torch.inference_mode():
+        ...     audio_features = model.get_audio_features(**inputs)
+        ```"""
+        # NOTE: `attention_mask` is accepted but not forwarded to the audio model.
+        pooled_audio = self.audio_model(input_features=input_features, is_longer=is_longer).pooler_output
+
+        # Project the pooled output, then normalise it along the last dimension.
+        projected = self.audio_projection(pooled_audio)
+
+        return F.normalize(projected, dim=-1)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.FloatTensor] = None,
+        is_longer: Optional[torch.BoolTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ClapOutput]:
+        r"""
+        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
+            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
+            the features.
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+
+        Examples:
+
+        ```python
+        >>> from datasets import load_dataset
+        >>> from transformers import AutoProcessor, ClapModel
+
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+        >>> audio_sample = dataset["train"]["audio"][0]["array"]
+
+        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
+        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")
+
+        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]
+
+        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
+        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
+        ```"""
+        # Use CLAP model's config for some fields (if specified) instead of those of audio & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        audio_outputs = self.audio_model(
+            input_features=input_features,
+            is_longer=is_longer,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output
+        audio_embeds = self.audio_projection(audio_embeds)
+
+        text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output
+        text_embeds = self.text_projection(text_embeds)
+
+        # normalized features
+        audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+        # cosine similarity as logits
+        logit_scale_text = self.logit_scale_t.exp()
+        logit_scale_audio = self.logit_scale_a.exp()
+        logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale_text
+        logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio
+
+        loss = None
+        if return_loss:
+            caption_loss = contrastive_loss(logits_per_text)
+            audio_loss = contrastive_loss(logits_per_audio.t())
+            loss = (caption_loss + audio_loss) / 2.0
+
+        return ClapOutput(
+            loss=loss,
+            logits_per_audio=logits_per_audio,
+            logits_per_text=logits_per_text,
+            text_embeds=text_embeds,
+            audio_embeds=audio_embeds,
+            text_model_output=text_outputs,
+            audio_model_output=audio_outputs,
+        )
+
+
+@auto_docstring
+class ClapTextModelWithProjection(ClapPreTrainedModel):
+    config: ClapTextConfig
+
+    def __init__(self, config: ClapTextConfig):
+        super().__init__(config)
+        self.text_model = ClapTextModel(config)
+        self.text_projection = ClapProjectionLayer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.text_model.embeddings.word_embeddings = value
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ClapTextModelOutput]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection
+
+        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
+        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
+
+        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> text_embeds = outputs.text_embeds
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output
+
+        text_embeds = self.text_projection(pooled_output)
+
+        return ClapTextModelOutput(
+            text_embeds=text_embeds,
+            last_hidden_state=text_outputs.last_hidden_state,
+            hidden_states=text_outputs.hidden_states,
+            attentions=text_outputs.attentions,
+        )
+
+
+@auto_docstring
+class ClapAudioModelWithProjection(ClapPreTrainedModel):
+    config: ClapAudioConfig
+    main_input_name = "input_features"
+
+    def __init__(self, config: ClapAudioConfig):
+        super().__init__(config)
+        self.audio_model = ClapAudioModel(config)
+        self.audio_projection = ClapProjectionLayer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.audio_model.audio_encoder.patch_embed.proj
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_features: Optional[torch.FloatTensor] = None,
+        is_longer: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ClapAudioModelOutput]:
+        r"""
+        is_longer (`torch.BoolTensor`, of shape `(batch_size, 1)`, *optional*):
+            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
+            the features.
+
+        Examples:
+
+        ```python
+        >>> from datasets import load_dataset
+        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor
+
+        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
+        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
+
+        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
+        >>> audio_sample = dataset["train"]["audio"][0]["array"]
+
+        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> audio_embeds = outputs.audio_embeds
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        audio_outputs = self.audio_model(
+            input_features=input_features,
+            is_longer=is_longer,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output
+
+        audio_embeds = self.audio_projection(pooled_output)
+
+        return ClapAudioModelOutput(
+            audio_embeds=audio_embeds,
+            last_hidden_state=audio_outputs.last_hidden_state,
+            attentions=audio_outputs.attentions,
+            hidden_states=audio_outputs.hidden_states,
+        )
+
+
+__all__ = [
+    "ClapModel",
+    "ClapPreTrainedModel",
+    "ClapTextModel",
+    "ClapTextModelWithProjection",
+    "ClapAudioModel",
+    "ClapAudioModelWithProjection",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/processing_clap.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/processing_clap.py
new file mode 100644
index 0000000000000000000000000000000000000000..6524a87158418206b8b96a7b57f6c1b7392e56cf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/processing_clap.py
@@ -0,0 +1,75 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Audio/Text processor class for CLAP
+"""
+
+from typing import Optional, Union
+
+from ...audio_utils import AudioInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+
+
+logger = logging.get_logger(__name__)
+
+
+class ClapProcessor(ProcessorMixin):
+    r"""
+    Constructs a CLAP processor which wraps a CLAP feature extractor and a RoBERTa tokenizer into a single processor.
+
+    [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the
+    [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information.
+
+    Args:
+        feature_extractor ([`ClapFeatureExtractor`]):
+            The audio processor is a required input.
+        tokenizer ([`RobertaTokenizerFast`]):
+            The tokenizer is a required input.
+    """
+
+    feature_extractor_class = "ClapFeatureExtractor"
+    tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast")
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+
+    @deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
+    def __call__(
+        self,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audios: Optional[AudioInput] = None,
+        audio: Optional[AudioInput] = None,
+        **kwargs: Unpack[ProcessingKwargs],
+    ):
+        """
+        Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text`
+        argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more
+        information.
+        """
+        # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check
+        # again that the correct naming is used
+        if audios is not None and audio is None:
+            logger.warning(
+                "Using `audios` keyword argument is deprecated when calling ClapProcessor, instead use `audio`."
+            )
+            audio = audios
+
+        return super().__call__(text=text, audio=audio, **kwargs)
+
+
+__all__ = ["ClapProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad2d57500c44322b6c749a313f07c07e11f8f20f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_cohere import *
+    from .modeling_cohere import *
+    from .tokenization_cohere_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py
new file mode 100644
index 0000000000000000000000000000000000000000..c78d1e9bf8a11c45246a5cb391769566dc69ce50
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/configuration_cohere.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2024 Cohere team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Cohere model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CohereConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate a Cohere
+    model according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`CohereModel`].
+        hidden_size (`int`, *optional*, defaults to 8192):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22528):
+            Dimension of the MLP representations.
+        logit_scale (`float`, *optional*, defaults to 0.0625):
+            The scaling factor for the output logits.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 5):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 255001):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        use_qk_norm (`bool`, *optional*, defaults to `False`):
+            Whether to use query-key normalization in the attention
+
+    ```python
+    >>> from transformers import CohereModel, CohereConfig
+
+    >>> # Initializing a Cohere model configuration
+    >>> configuration = CohereConfig()
+
+    >>> # Initializing a model from the Cohere configuration
+    >>> model = CohereModel(configuration) # doctest: +SKIP
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config # doctest: +SKIP
+    ```"""
+
+    model_type = "cohere"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=8192,
+        intermediate_size=22528,
+        logit_scale=0.0625,
+        num_hidden_layers=40,
+        num_attention_heads=64,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=5,
+        eos_token_id=255001,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_qk_norm=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.logit_scale = logit_scale
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_qk_norm = use_qk_norm
+
+        # Validate the correctness of rotary position embeddings parameters
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["CohereConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dfa0ce0be33a6f3383e76d81de4f3895ab70bea
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modeling_cohere.py
@@ -0,0 +1,534 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/cohere/modular_cohere.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_cohere.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Cohere team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on the LLama model definition file in transformers
+
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_cohere import CohereConfig
+
+
+class CohereLayerNorm(nn.Module):
+    def __init__(self, hidden_size=None, eps=1e-5, bias=False):
+        """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        mean = hidden_states.mean(-1, keepdim=True)
+        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
+        hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon)
+        hidden_states = self.weight.to(torch.float32) * hidden_states
+        return hidden_states.to(input_dtype)
+
+
+class CohereRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: CohereConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class CohereMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def rotate_half(x):
+    # Split and rotate. Note that this function is different from e.g. Llama.
+    x1 = x[..., ::2]
+    x2 = x[..., 1::2]
+    rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
+    return rot_x
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    dtype = q.dtype
+    q = q.float()
+    k = k.float()
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
+
+
+class CohereAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.use_qk_norm = config.use_qk_norm
+        if self.use_qk_norm:
+            # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
+            self.q_norm = CohereLayerNorm(
+                hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps
+            )
+            self.k_norm = CohereLayerNorm(
+                hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
+            )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        key_states = self.k_proj(hidden_states).view(hidden_shape)
+        value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        if self.use_qk_norm:  # main diff from Llama
+            query_states = self.q_norm(query_states)
+            key_states = self.k_norm(key_states)
+
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; position_ids needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class CohereDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: CohereConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = CohereAttention(config=config, layer_idx=layer_idx)
+        self.mlp = CohereMLP(config)
+        self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states_attention, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        hidden_states_mlp = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states_attention + hidden_states_mlp
+        return hidden_states
+
+
+@auto_docstring
+class CoherePreTrainedModel(PreTrainedModel):
+    config: CohereConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["CohereDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": CohereDecoderLayer,
+        "attentions": CohereAttention,
+    }
+
+
+@auto_docstring
+class CohereModel(CoherePreTrainedModel):
+    def __init__(self, config: CohereConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
+        self.rotary_emb = CohereRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = CohereModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.logit_scale = config.logit_scale
+        self.tie_word_embeddings = config.tie_word_embeddings
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >> from transformers import AutoTokenizer, CohereForCausalLM
+
+        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
+        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
+
+        >> prompt = "Hey, are you conscious? Can you talk to me?"
+        >> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >> # Generate
+        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        logits = logits * self.logit_scale  # main diff from Llama
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["CohereForCausalLM", "CohereModel", "CoherePreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py
new file mode 100644
index 0000000000000000000000000000000000000000..daa12a15ed268967df7a3852424ba98944a646b1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/modular_cohere.py
@@ -0,0 +1,355 @@
+# coding=utf-8
+# Copyright 2024 Cohere team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on the LLama model definition file in transformers
+
+"""PyTorch Cohere model."""
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaForCausalLM,
+    LlamaMLP,
+    LlamaModel,
+    LlamaRotaryEmbedding,
+    eager_attention_forward,
+)
+from .configuration_cohere import CohereConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class CohereLayerNorm(nn.Module):
+    def __init__(self, hidden_size=None, eps=1e-5, bias=False):
+        """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim"""
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        mean = hidden_states.mean(-1, keepdim=True)
+        variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
+        hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon)
+        hidden_states = self.weight.to(torch.float32) * hidden_states
+        return hidden_states.to(input_dtype)
+
+
+class CohereRotaryEmbedding(LlamaRotaryEmbedding):
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.repeat_interleave(freqs, 2, dim=-1)  # diff from Llama: we interleave() instead of cat()
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    # Split and rotate. Note that this function is different from e.g. Llama.
+    x1 = x[..., ::2]
+    x2 = x[..., 1::2]
+    rot_x = torch.stack([-x2, x1], dim=-1).flatten(-2)
+    return rot_x
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    dtype = q.dtype
+    q = q.float()
+    k = k.float()
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype)
+
+
+class CohereMLP(LlamaMLP):
+    def __init__(self, config):
+        super().__init__(config)
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+
+
+class CohereAttention(LlamaAttention):
+    """Multi-headed attention from 'Attention Is All You Need' paper, with optional LayerNorm on query/key heads"""
+
+    def __init__(self, config: CohereConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        self.use_qk_norm = config.use_qk_norm
+        if self.use_qk_norm:
+            # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads
+            self.q_norm = CohereLayerNorm(
+                hidden_size=(config.num_attention_heads, self.head_dim), eps=config.layer_norm_eps
+            )
+            self.k_norm = CohereLayerNorm(
+                hidden_size=(config.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps
+            )
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)  # -1 lets view() infer the number of heads
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape)
+        key_states = self.k_proj(hidden_states).view(hidden_shape)
+        value_states = self.v_proj(hidden_states).view(hidden_shape)
+
+        if self.use_qk_norm:  # main diff from Llama
+            query_states = self.q_norm(query_states)
+            key_states = self.k_norm(key_states)
+
+        query_states = query_states.transpose(1, 2)  # heads before sequence, assuming (batch, seq, hidden) input
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin/cos (RoPE) and cache_position are handed to the cache's update() through cache_kwargs
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # back to input_shape + one feature dim
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class CohereDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: CohereConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = CohereAttention(config=config, layer_idx=layer_idx)
+        self.mlp = CohereMLP(config)
+        self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Not a named parameter of this method; if supplied it is only forwarded through `**kwargs`. The
+                attention weights returned by `self_attn` are discarded here.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states_attention, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        hidden_states_mlp = self.mlp(hidden_states)  # MLP reads the same normalized input as attention
+        hidden_states = residual + hidden_states_attention + hidden_states_mlp  # one residual add over both branches
+        return hidden_states
+
+
+class CohereModel(LlamaModel):  # LlamaModel whose layers, rotary embedding and final norm are replaced below
+    def __init__(self, config: CohereConfig):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.rotary_emb = CohereRotaryEmbedding(config=config)
+        self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps)
+
+
+class CohereForCausalLM(LlamaForCausalLM):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = CohereModel(config)
+        self.logit_scale = config.logit_scale
+        self.tie_word_embeddings = config.tie_word_embeddings
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >> from transformers import AutoTokenizer, CohereForCausalLM
+
+        >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
+        >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
+
+        >> prompt = "Hey, are you conscious? Can you talk to me?"
+        >> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >> # Generate
+        >> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # run the decoder; only last_hidden_state, past_key_values, hidden_states and attentions are read below
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits: an int keeps the last `logits_to_keep` positions (0 keeps all), else index dim 1
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        logits = logits * self.logit_scale  # main diff from Llama
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # public names of this module
+    "CohereForCausalLM",
+    "CohereModel",
+    "CoherePreTrainedModel",  # noqa: F822
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd240b97848033ab77e984f85d91aa63e190df0c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere/tokenization_cohere_fast.py
@@ -0,0 +1,512 @@
+# coding=utf-8
+# Copyright 2024 Cohere team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on the tokenization_llama_fast.py file in transformers
+
+import pickle
+from typing import Literal, Union
+
+from tokenizers import processors
+
+from ...tokenization_utils_base import BatchEncoding
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
+
+PRETRAINED_VOCAB_FILES_MAP = {  # file kind -> {checkpoint name -> URL}
+    "tokenizer_file": {
+        "Cohere/Command-nightly": "https://huggingface.co/Cohere/Command-nightly/blob/main/tokenizer.json",  # NOTE(review): `/blob/` looks like the web page URL rather than the raw file (`/resolve/`) - confirm this map is still consumed
+    },
+}
+
+# fmt: off
+DEFAULT_SYSTEM_PROMPT = "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere."
+DEFAULT_RAG_PREAMBLE = """## Task and Context
+You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
+
+## Style Guide
+Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."""
+# fmt: on
+
+
+class CohereTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a Cohere tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    This uses notably ByteFallback and NFC normalization.
+
+    ```python
+    >>> from transformers import AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
+    >>> tokenizer.encode("Hello this is a test")
+    [5, 28339, 2075, 1801, 1671, 3282]
+    ```
+
+    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
+    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
+    values of the first token and final token of an encoded sequence will not be correct). For more details, check out
+    [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<UNK>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<BOS_TOKEN>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|END_OF_TURN_TOKEN|>"`):
+            The end of sequence token.
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Cohere tokenizer should be used.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not the tokenizer should automatically add a prefix space
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    padding_side = "left"
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
+    # No `max_model_input_sizes`
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        clean_up_tokenization_spaces=False,
+        unk_token="<UNK>",
+        bos_token="<BOS_TOKEN>",
+        eos_token="<|END_OF_TURN_TOKEN|>",
+        add_bos_token=True,
+        add_eos_token=False,
+        use_default_system_prompt=False,
+        add_prefix_space=False,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            use_default_system_prompt=use_default_system_prompt,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+        self._add_bos_token = add_bos_token  # backing fields for the add_bos_token / add_eos_token properties
+        self._add_eos_token = add_eos_token
+        self.update_post_processor()  # reads the two flags assigned just above, so it must run after them
+        self.use_default_system_prompt = use_default_system_prompt
+        self.vocab_file = vocab_file
+        self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)  # NOTE(review): popped only after **kwargs was already passed to super().__init__
+        self.tool_use_template = kwargs.pop("tool_use_template", None)
+
+        # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
+        # check this as they were green before.
+        pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer)  # serialized so the flag can be patched as bytes
+        decoder_state = pickle.dumps(self.backend_tokenizer.decoder)
+
+        if add_prefix_space:
+            pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')  # both byte strings are 24 bytes long, so the payload length is unchanged
+            decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true')
+        self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state)  # round-trips our own dumps() output, not external data
+        self.backend_tokenizer.decoder = pickle.loads(decoder_state)
+
+        self.add_prefix_space = add_prefix_space
+
+    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        # Pretokenized (already split into words) input is rejected unless add_prefix_space is enabled.
+        pretokenized = kwargs.get("is_split_into_words", False)
+        if pretokenized and not self.add_prefix_space:
+            raise Exception(
+                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+                " pretokenized inputs."
+            )
+        return super()._batch_encode_plus(*args, **kwargs)
+
+    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        # Pretokenized (already split into words) input is rejected unless add_prefix_space is enabled.
+        pretokenized = kwargs.get("is_split_into_words", False)
+        if pretokenized and not self.add_prefix_space:
+            raise Exception(
+                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
+                " pretokenized inputs."
+            )
+
+        return super()._encode_plus(*args, **kwargs)
+
+    def update_post_processor(self):
+        """
+        Rebuilds the backend `TemplateProcessing` post processor from the current `bos_token`/`eos_token` and the `add_bos_token`/`add_eos_token` flags; raises `ValueError` if an enabled token is `None`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError("add_bos_token = True but bos_token = None")
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        if eos is None and self.add_eos_token:
+            raise ValueError("add_eos_token = True but eos_token = None")
+
+        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"  # template for one sequence ($A)
+        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"  # `single` followed by the same wrapping for $B, tagged `:1`
+
+        special_tokens = []  # (token, id) pairs, only for the tokens actually used in the templates
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    @property
+    def add_eos_token(self):  # read-through to the backing field
+        return self._add_eos_token
+
+    @property
+    def add_bos_token(self):  # read-through to the backing field
+        return self._add_bos_token
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        self._add_eos_token = value
+        self.update_post_processor()  # rebuild the template so the new flag takes effect
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        self._add_bos_token = value
+        self.update_post_processor()  # rebuild the template so the new flag takes effect
+
+    def apply_tool_use_template(
+        self,
+        conversation: list[dict[str, str]],
+        tools: list[dict],
+        **kwargs,
+    ) -> Union[str, list[int]]:
+        """Create a Command-R tool-use prompt.
+
+        Once rendered, the prompt instructs the model to generate a list of actions to perform on a set of user supplied tools
+        to help carry out the user's requests.
+
+        Conceptually, this works in the same way as `apply_chat_template`, but takes an additional `tools` parameter.
+
+        Converts a chat in the form of a list of dictionaries with `"role"` and `"content"` keys and a list of available
+        tools for the model to use into a prompt string, or a list of token ids.
+        This method calls `apply_chat_template` with `chat_template="tool_use"` and forwards `tools` and `**kwargs` to it.
+        You can override the default template using the `tool_use_template` kwarg but the quality of your results may decrease.
+
+        Args:
+            conversation (list[dict[str, str]]): A list of dicts
+                with "role" and "content" keys, representing the chat history so far.
+            tools (list[Dict]): a list of tools to render into the prompt for the model to choose from.
+                See an example at the bottom of the docstring.
+                The format should be:
+                   * name (str): The name of the tool to be called. Valid names contain only the characters a-z,
+                        A-Z, 0-9, _ and must not begin with a digit.
+                   * description (str): The description of what the tool does, the model uses the description to
+                        choose when and how to call the function.
+                   * parameter_definitions (dict[str, Dict]): The input parameters of the tool. Accepts a dictionary
+                        where the key is the name of the parameter and the value is the parameter spec.
+                        Valid parameter names contain only the characters a-z, A-Z, 0-9, _ and must not begin with a digit.
+                        Parameter specs are as follows:
+                       * description (str): The description of the parameter.
+                       * type (str): the type of the parameter - most effective for python builtin data types, such as 'str', 'bool'
+                       * required: boolean: Denotes whether the parameter is always present (required) or not. Defaults to not required.
+            add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate
+                the start of an assistant message. This is useful when you want to generate a response from the model.
+                Note that this argument will be passed to the chat template, and so it must be supported in the
+                template for this argument to have any effect.
+            tokenize (`bool`, defaults to `True`):
+                Whether to tokenize the output. If `False`, the output will be a string.
+            padding (`bool`, defaults to `False`):
+                Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`.
+            truncation (`bool`, defaults to `False`):
+                Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
+            max_length (`int`, *optional*):
+                Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
+                not specified, the tokenizer's `max_length` attribute will be used as a default.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
+                values are:
+                - `'tf'`: Return TensorFlow `tf.Tensor` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+            return_dict (`bool`, *optional*, defaults to `False`):
+                Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
+            **tokenizer_kwargs: Additional kwargs to pass to the tokenizer.
+
+        Returns:
+            `str`: A rendered prompt string.
+            or if tokenize=True:
+            `list[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This
+            output is ready to pass to the model, either directly or via methods like `generate()`.
+
+        Examples:
+
+        ```python
+        >> tokenizer = CohereTokenizerFast.from_pretrained("CohereForAI/c4ai-command-r-v01")
+        >> tools = [
+            {
+                "name": "internet_search",
+                "description": "Returns a list of relevant document snippets for a textual query retrieved from the internet",
+                "parameter_definitions": {
+                    "query": {
+                        "description": "Query to search the internet with",
+                        "type": "str",
+                        "required": True
+                    }
+                }
+            },
+            {
+                "name": "directly_answer",
+                "description": "Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history",
+                "parameter_definitions": {}
+            }
+        ]
+        >> conversation = [
+            {"role": "user", "content": "Whats the biggest penguin in the world?"}
+        ]
+        >> # render the prompt, ready for user to inspect, or for input into the model:
+        >> prompt = tokenizer.apply_tool_use_template(conversation, tools=tools, tokenize=False, add_generation_prompt=True)
+        >> print(prompt)
+        <BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
+        The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
+
+        # System Preamble
+        ## Basic Rules
+        You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
+
+        # User Preamble
+        ## Task and Context
+        You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
+
+        ## Style Guide
+        Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.
+
+        ## Available Tools
+        Here is a list of tools that you have available to you:
+
+        \\`\\`\\`python
+        def internet_search(query: str) -> list[Dict]:
+            \"\"\"Returns a list of relevant document snippets for a textual query retrieved from the internet
+
+            Args:
+                query (str): Query to search the internet with
+            \"\"\"
+            pass
+        \\`\\`\\`
+
+        \\`\\`\\`python
+        def directly_answer() -> list[Dict]:
+            \"\"\"Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history
+            \"\"\"
+            pass
+        \\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
+        \\`\\`\\`json
+        [
+            {
+                "tool_name": title of the tool in the specification,
+                "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
+            }
+        ]\\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
+        ```
+        >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')
+        >> outputs = model.generate(inputs, max_new_tokens=128)
+        >> print(tokenizer.decode(outputs[0]))
+        Action: ```json
+        [
+            {
+                "tool_name": "internet_search",
+                "parameters": {
+                    "query": "biggest penguin in the world"
+                }
+            }
+        ]
+        ```
+        """
+        return self.apply_chat_template(
+            conversation,
+            chat_template="tool_use",
+            tools=tools,
+            **kwargs,
+        )
+
+    def apply_grounded_generation_template(
+        self,
+        conversation: list[dict[str, str]],
+        documents: list[dict],
+        citation_mode: Literal["fast", "accurate"] = "accurate",
+        **kwargs,
+    ) -> Union[str, list[int]]:
+        """Create a Command-R grounded generation (aka RAG) prompt.
+
+        Once rendered, the prompt instructs the model to generate a response with citations in, based on supplied documents.
+
+        Conceptually, this works in the same way as `apply_chat_format`, but takes additional `documents`
+        and parameter `citation_mode` parameters.
+
+        Converts a list of dictionaries with `"role"` and `"content"` keys and a list of
+        documents for the model to ground its response on into a prompt string, or a list of token ids.
+        This method will use the tokenizer's `grounded_generation_template` template specified at the class level.
+        You can override the default template using the `grounded_generation_template` kwarg but the quality of your results may decrease.
+
+        Args:
+            conversation (list[dict[str, str]]): A list of dicts
+                with "role" and "content" keys, representing the chat history so far.
+            documents (list[dict[str, str]): A list of dicts, representing documents or tool outputs to ground your
+                generation on. A document is a semistructured dict, with a string to string mapping. Common fields are
+                `url`, `title`, `snippet` etc but should be descriptive of the key. They will get rendered into the prompt.
+            citation_mode: either "accurate" (prompt the model to generate an answer first, then rewrite it with citation
+                spans in) or "fast", where the prompt instructs the model to generate an answer with citations in directly.
+                The former has higher quality citations, the latter requires fewer tokens to be generated.
+            add_generation_prompt (bool, *optional*): Whether to end the prompt with the token(s) that indicate
+                the start of an assistant message. This is useful when you want to generate a response from the model.
+                Note that this argument will be passed to the chat template, and so it must be supported in the
+                template for this argument to have any effect.
+            tokenize (`bool`, defaults to `True`):
+                Whether to tokenize the output. If `False`, the output will be a string.
+            padding (`bool`, defaults to `False`):
+                Whether to pad sequences to the maximum length. Has no effect if tokenize is `False`.
+            truncation (`bool`, defaults to `False`):
+                Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
+            max_length (`int`, *optional*):
+                Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
+                not specified, the tokenizer's `max_length` attribute will be used as a default.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
+                values are:
+                - `'tf'`: Return TensorFlow `tf.Tensor` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+            return_dict (`bool`, *optional*, defaults to `False`):
+                Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
+            **tokenizer_kwargs: Additional kwargs to pass to the tokenizer.
+
+        Returns:
+            `str`: A rendered prompt string.
+            or if tokenize=True:
+            `list[int]`: A list of token ids representing the tokenized chat so far, including control tokens. This
+            output is ready to pass to the model, either directly or via methods like `generate()`.
+
+        Examples:
+
+        ```python
+        >> tokenizer = CohereTokenizerFast.from_pretrained('CohereForAI/c4ai-command-r-v01')
+
+        >> # define documents:
+        >> documents = [
+            { "title": "Tall penguins", "text": "Emperor penguins are the tallest." },
+            { "title": "Penguin habitats", "text": "Emperor penguins only live in Antarctica."}
+        ]
+        >> # define a conversation:
+        >> conversation = [
+            {"role": "user", "content": "Whats the biggest penguin in the world?"}
+        ]
+        >> # render the prompt, ready for user to inspect, or for input into the model:
+        >> grounded_generation_prompt = tokenizer.apply_grounded_generation_template(conversation, documents=documents, tokenize=False, add_generation_prompt=True)
+        >> print(grounded_generation_prompt)
+        <BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
+        The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
+
+        ## Basic Rules
+        You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
+
+        # User Preamble
+        ## Task and Context
+        You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
+
+        ## Style Guide
+        Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><results>
+        Document: 0
+        title: Tall penguins
+        text: Emperor penguins are the tallest.
+
+        Document: 1
+        title: Penguin habitats
+        text: Emperor penguins only live in Antarctica.
+        </results><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line.
+        Firstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'.
+        Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'.
+        Thirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.
+        Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols <co: doc> and </co: doc> to indicate when a fact comes from a document in the search result, e.g <co: 0>my fact</co: 0> for a fact from document 0.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'''
+        ```
+        >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')
+        >> outputs = model.generate(inputs, max_new_tokens=128)
+        >> print(tokenizer.decode(outputs[0]))
+        Relevant Documents: 0,1
+        Cited Documents: 0,1
+        Answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres.
+        Grounded answer: The <co: 0>Emperor Penguin</co: 0> is the <co: 0>tallest</co: 0> or biggest penguin in the world. It is a bird that <co: 1>lives only in Antarctica</co: 1> and <co: 0>grows to a height of around 122 centimetres.</co: 0>
+        """
+        return self.apply_chat_template(
+            conversation,
+            chat_template="rag",
+            documents=documents,
+            citation_mode=citation_mode,
+            **kwargs,
+        )
+
+    # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """Wrap each given id sequence in BOS/EOS ids, as enabled by `add_bos_token` / `add_eos_token`."""
+        prefix = [self.bos_token_id] if self.add_bos_token else []
+        suffix = [self.eos_token_id] if self.add_eos_token else []
+        wrapped = prefix + token_ids_0 + suffix
+
+        if token_ids_1 is None:
+            return wrapped
+        # Pair input: the second sequence gets its own BOS/EOS wrapping.
+        return wrapped + prefix + token_ids_1 + suffix
+
+
+__all__ = ["CohereTokenizerFast"]  # the only name exported by `from <this module> import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b20eb3c1e0a8b42669761c9c45ca292b490df27
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static type checkers take this branch and see the real star-imports
+    from .configuration_cohere2_vision import *
+    from .image_processing_cohere2_vision_fast import *
+    from .modeling_cohere2_vision import *
+    from .processing_cohere2_vision import *
+else:  # at runtime the entry for this package in `sys.modules` is replaced instead
+    import sys
+    # NOTE(review): `_LazyModule` presumably defers the submodule imports until first access - confirm.
+    _file = globals()["__file__"]  # path of this `__init__.py`
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..acc40fcf85711961a12f9536e6f4c0c97e94f59e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/configuration_cohere2_vision.py
@@ -0,0 +1,82 @@
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+class Cohere2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Cohere2VisionForConditionalGeneration`]. It is used to instantiate a
+    Cohere2 Vision model according to the specified arguments, defining the model architecture.
+
+    [CohereLabs/command-a-vision-07-2025](https://huggingface.co/CohereLabs/command-a-vision-07-2025)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `SiglipVisionConfig`):
+            Config object or dict for the vision backbone; a dict without `model_type` is read as `siglip_vision_model`.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Cohere2Config`):
+            Config object or dict for the text backbone; a dict without `model_type` is read as `cohere2`.
+        downsample_factor (`int`, *optional*, defaults to 2):
+            The factor by which to downsample the input image.
+        image_token_id (`int`, *optional*, defaults to 255036):
+            The token ID to use as placeholder for the image input.
+        alignment_intermediate_size (`int`, *optional*, defaults to 36864):
+            The size of the intermediate layer for alignment.
+    """
+
+    model_type = "cohere2_vision"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        downsample_factor=2,
+        image_token_id=255036,
+        alignment_intermediate_size=36864,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.downsample_factor = downsample_factor
+        self.image_token_id = image_token_id
+        self.alignment_intermediate_size = alignment_intermediate_size
+        # Sub-config dicts are merged into fresh dicts below so the caller's arguments are never mutated.
+        if isinstance(vision_config, dict):
+            vision_config = {"model_type": "siglip_vision_model", **vision_config}
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["siglip_vision_model"](
+                hidden_size=1152,
+                intermediate_size=3072,
+                image_size=512,
+                num_hidden_layers=27,
+                num_attention_heads=12,
+            )
+
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config = {"model_type": "cohere2", **text_config}
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True)
+
+        self.text_config = text_config
+
+
+__all__ = ["Cohere2VisionConfig"]  # the only name exported by `from <this module> import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..322e98dbd0f592fd152a2f87c3c5b9849d4dee29
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py
@@ -0,0 +1,306 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_cohere2_vision.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import lru_cache
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, ImageInput, PILImageResampling, SizeDict
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring
+
+
+class Cohere2VisionFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    crop_to_patches (`bool`, *optional*, defaults to `True`):
+        Whether to crop the image to patches. Can be overridden by the `crop_to_patches` parameter in the
+        `preprocess` method.
+    min_patches (`int`, *optional*, defaults to 1):
+        The minimum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `min_patches` parameter in the `preprocess` method.
+    max_patches (`int`, *optional*, defaults to 12):
+        The maximum number of patches to be extracted from the image. Only has an effect if `crop_to_patches` is
+        set to `True`. Can be overridden by the `max_patches` parameter in the `preprocess` method.
+    """
+
+    crop_to_patches: Optional[bool]
+    min_patches: Optional[int]
+    max_patches: Optional[int]
+
+
+@lru_cache(maxsize=10)
+def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]:
+    """
+    Enumerate every tile grid whose total tile count does not exceed `max_image_tiles`.
+
+    Pairs are produced with the width varying slowest and the height fastest. Results are memoised by
+    `lru_cache`, so repeated calls with the same argument return the same list object; treat it as
+    read-only.
+
+    Args:
+        max_image_tiles (`int`):
+            Upper bound on `width * height`, i.e. on the number of tiles in a grid.
+
+    Returns:
+        `list[tuple[int, int]]`: All `(width, height)` pairs, counted in tiles, that satisfy the bound.
+
+    Example:
+        >>> get_all_supported_aspect_ratios(4)
+        [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)]
+
+    """
+    tile_counts = range(1, max_image_tiles + 1)
+    return [
+        (n_wide, n_high)
+        for n_wide in tile_counts
+        for n_high in tile_counts
+        if n_wide * n_high <= max_image_tiles
+    ]
+
+
+def get_optimal_tiled_canvas(
+    original_image_size: tuple[int, int],
+    target_tile_size: tuple[int, int],
+    min_image_tiles: int,
+    max_image_tiles: int,
+) -> tuple[int, int]:
+    """Pick the `(columns, rows)` tile grid whose canvas best fits an image given as `(height, width)` pixels.
+
+    Only grids of at most `max_image_tiles` tiles are considered; `min_image_tiles` is accepted but currently unused.
+    """
+    possible_resolutions = sorted(get_all_supported_aspect_ratios(max_image_tiles), key=lambda x: x[0] * x[1])
+    image_height, image_width = original_image_size
+    patch_size_height, patch_size_width = target_tile_size  # tiles are given as (height, width)
+    # Grids are (width, height) in tiles, so size and compare them against (width, height) in pixels.
+    candidate_resolutions = np.array(possible_resolutions) * np.array([patch_size_width, patch_size_height])
+    required_scales = candidate_resolutions / np.array([image_width, image_height])
+    required_scale = np.min(required_scales, axis=-1, keepdims=True)  # [n_resolutions, 1]
+    if np.all(required_scale < 1):
+        # We are forced to downscale, so try to minimize the amount of downscaling
+        return possible_resolutions[np.argmax(required_scale)]
+    # Pick the resolution that required the least upscaling so that it most closely fits the image
+    required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
+    return possible_resolutions[np.argmin(required_scale)]
+
+
+@auto_docstring
+class Cohere2VisionImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 512, "width": 512}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    crop_to_patches = True
+    min_patches = 1
+    max_patches = 12
+    valid_kwargs = Cohere2VisionFastImageProcessorKwargs
+    patch_size = 16
+
+    def __init__(self, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[Cohere2VisionFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def crop_image_to_patches(
+        self,
+        images: "torch.Tensor",
+        min_patches: int,
+        max_patches: int,
+        use_thumbnail: bool = True,
+        patch_size: Optional[Union[tuple, int, dict]] = None,
+        interpolation: Optional["F.InterpolationMode"] = None,
+    ):
+        """
+        Crop the images to patches and return them stacked in a single tensor.
+        The number of patches and their grid arrangement are determined by the original image size,
+        the target patch size and the minimum and maximum number of patches.
+        The aspect ratio of the patches grid is chosen to be the closest to the original image aspect ratio.
+
+        Args:
+            images (`torch.Tensor`):
+                The images to be cropped. The last two dimensions are read as height and width.
+            min_patches (`int`):
+                The minimum number of patches to be extracted from the image.
+            max_patches (`int`):
+                The maximum number of patches to be extracted from the image.
+            use_thumbnail (`bool`, *optional*, defaults to `True`):
+                Whether to append a thumbnail (the whole image resized to `patch_size`) after the patches.
+                The thumbnail is only added when the image is split into more than one patch.
+            patch_size (`SizeDict`):
+                The size of the output patches. Required despite the `None` default: its `height` and `width`
+                attributes are read unconditionally.
+            interpolation (`F.InterpolationMode`, *optional*):
+                The interpolation mode forwarded to `self.resize`.
+
+        Returns:
+            `torch.Tensor`: The patches (and the thumbnail, if any) stacked along a new leading dimension that is
+            then swapped with dimension 1.
+        """
+        patch_size_height, patch_size_width = patch_size.height, patch_size.width
+        original_height, original_width = images.shape[-2:]
+        # find the closest aspect ratio to the target
+        num_columns, num_rows = get_optimal_tiled_canvas(
+            (original_height, original_width), (patch_size_height, patch_size_width), min_patches, max_patches
+        )
+
+        # resize the image so that every cell of the grid is exactly one patch
+        canvas_size = SizeDict(height=patch_size_height * num_rows, width=patch_size_width * num_columns)
+        resized_image = self.resize(images, canvas_size, interpolation=interpolation)
+
+        # split the image into patches, walking the grid row by row
+        processed_images = []
+        for i in range(num_columns * num_rows):
+            top = (i // num_columns) * patch_size_height
+            left = (i % num_columns) * patch_size_width
+            processed_images.append(resized_image[..., top : top + patch_size_height, left : left + patch_size_width])
+
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = self.resize(images, patch_size, interpolation=interpolation)
+            processed_images.append(thumbnail_img)
+
+        processed_images = torch.stack(processed_images, dim=0).transpose(0, 1).contiguous()
+
+        return processed_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        crop_to_patches: bool,
+        min_patches: int,
+        max_patches: int,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Turn a list of image tensors into model inputs.
+
+        When `crop_to_patches` is set, every image is first tiled by `crop_image_to_patches` and the resulting
+        patches are flattened into one list; otherwise each image counts as a single patch. The images are then
+        resized (if `do_resize`), center-cropped (if `do_center_crop`) and rescaled/normalized, and returned as a
+        `BatchFeature` holding `pixel_values` and the per-image `num_patches`.
+        """
+        if crop_to_patches:
+            grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+            processed_images_grouped = {}
+            num_patches = {}
+            for shape, stacked_images in grouped_images.items():
+                stacked_images = self.crop_image_to_patches(
+                    stacked_images,
+                    min_patches,
+                    max_patches,
+                    patch_size=size,
+                    interpolation=interpolation,
+                )
+                processed_images_grouped[shape] = stacked_images
+                num_patches[shape] = [stacked_images.shape[1]] * stacked_images.shape[0]
+            images = reorder_images(processed_images_grouped, grouped_images_index)
+            images = [image for images_list in images for image in images_list]
+            num_patches = reorder_images(num_patches, grouped_images_index)
+        else:
+            num_patches = [1] * len(images)
+
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(
+            data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
+        )
+
+    def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+        """
+        A utility that returns the number of patches for a given image size.
+
+        Args:
+            height (`int`):
+                Height of the input image.
+            width (`int`):
+                Width of the input image.
+            images_kwargs (`dict`, *optional*):
+                Any kwargs to override defaults of the image processor. `None` means "no overrides".
+        Returns:
+            `int`: Number of patches per image: 1 when the image is not tiled, otherwise tiles + 1 (thumbnail).
+        """
+        images_kwargs = images_kwargs or {}  # optional argument: fall back to the processor defaults
+        min_patches = images_kwargs.get("min_patches", self.min_patches)
+        max_patches = images_kwargs.get("max_patches", self.max_patches)
+        patch_size = images_kwargs.get("patch_size", self.size)
+        crop_to_patches = images_kwargs.get("crop_to_patches", self.crop_to_patches)
+
+        num_patches = 1
+        if crop_to_patches and max_patches > 1:
+            num_columns, num_rows = get_optimal_tiled_canvas(
+                (height, width), (patch_size["height"], patch_size["width"]), min_patches, max_patches
+            )
+            if num_columns * num_rows > 1:
+                num_patches += num_columns * num_rows
+
+        return num_patches
+
+
+__all__ = ["Cohere2VisionImageProcessorFast"]  # the only name exported by `from <this module> import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc993967b5c767fe93f63fccb954b42a52a1c28
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modeling_cohere2_vision.py
@@ -0,0 +1,425 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/cohere2_vision/modular_cohere2_vision.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_cohere2_vision.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring
+from ...utils.generic import check_model_inputs
+from ..auto import AutoModel
+from .configuration_cohere2_vision import Cohere2VisionConfig
+
+
+class Cohere2VisionMultiModalProjector(nn.Module):  # folds neighbouring vision tokens, then maps them to the text hidden size
+    def __init__(self, config: Cohere2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.downsample_factor = config.downsample_factor
+        self.intermediate_size = config.alignment_intermediate_size
+        self.linear_1 = nn.Linear(  # input width matches the channel count produced by pixel_shuffle
+            config.vision_config.hidden_size * (config.downsample_factor**2), self.intermediate_size, bias=True
+        )
+        self.act = nn.SiLU()  # forward() splits linear_1's output in two, which is why linear_2 takes half the width
+        self.linear_2 = nn.Linear(self.intermediate_size // 2, config.text_config.hidden_size, bias=True)
+
+    def pixel_shuffle(self, image_features):  # image_features: (B, S, D)
+        batch_size, seq_length, feature_dim = image_features.shape  # feature_dim is not used below
+        height = width = int(seq_length**0.5)  # the S tokens are treated as a square grid
+        image_features = image_features.reshape(image_features.shape[0], width, height, -1)
+        channels = image_features.shape[-1]
+        image_features = image_features.reshape(  # fold `downsample_factor` neighbours along one axis into channels
+            batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor)
+        )
+        image_features = image_features.permute(0, 2, 1, 3)
+        image_features = image_features.reshape(  # same fold along the other axis
+            batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1
+        )
+        image_features = image_features.permute(0, 2, 1, 3)  # 4-D result; assumes the grid side divides evenly by the factor
+        return image_features
+
+    def forward(self, image_features):
+        image_features = self.pixel_shuffle(image_features)
+        hidden_states = self.linear_1(image_features)
+
+        # linear_1's output holds two halves along the last dimension: the values `x` and their gate (SwiGLU)
+        x, gate = hidden_states.chunk(2, dim=-1)
+        hidden_states = self.act(gate) * x
+
+        hidden_states = self.linear_2(hidden_states)  # last dimension becomes config.text_config.hidden_size
+        return hidden_states
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Cohere2Vision outputs, with hidden states and attentions.
+    """
+)
+class Cohere2VisionModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        Image features after the multimodal projector, exactly as `Cohere2VisionModel.forward` scattered them into
+        the input embeddings. `None` when the forward call received no `pixel_values`.
+    """
+
+    image_hidden_states: Optional[torch.FloatTensor] = None  # the only field added on top of BaseModelOutputWithPast
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Cohere2Vision causal language model (or autoregressive) outputs.
+    """
+)
+class Cohere2VisionCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        Image features after the multimodal projector, copied from the inner model's output.
+        `None` when the forward call received no `pixel_values`.
+    """
+
+    loss: Optional[torch.FloatTensor] = None  # stays None unless `labels` were passed to forward
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@auto_docstring
+class Cohere2VisionPreTrainedModel(PreTrainedModel):  # shared base of the two Cohere2Vision model classes in this file
+    config: Cohere2VisionConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    # NOTE(review): the flags below are consumed by PreTrainedModel (not visible here); meanings are inferred from names.
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_compile_fullgraph = False
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {  # NOTE(review): values look like layer-class names used to collect these outputs — confirm
+        "hidden_states": "DecoderLayer",
+        "attentions": "Attention",
+    }
+
+
+@auto_docstring(
+    custom_intro="""
+    The Cohere2Vision model which consists of a vision backbone and a language model, without a language modeling head.
+    """
+)
+class Cohere2VisionModel(Cohere2VisionPreTrainedModel):
+    _checkpoint_conversion_mapping = {}
+
+    def __init__(self, config: Cohere2VisionConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+
+        self.multi_modal_projector = Cohere2VisionMultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        """
+        Run the vision tower on `pixel_values` and project its last hidden state into the text embedding space.
+
+        Args:
+            pixel_values (`torch.FloatTensor`):
+               Image input, handed to the vision tower unchanged (its expected layout is defined by that model).
+        Returns:
+            `torch.Tensor`: Output of `multi_modal_projector` (a single tensor, not a list); its last dimension is
+            `config.text_config.hidden_size`.
+        """
+
+        image_features = self.vision_tower(pixel_values, output_hidden_states=True)
+        selected_image_feature = image_features.last_hidden_state
+        image_features = self.multi_modal_projector(selected_image_feature)
+        return image_features
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Build a boolean mask, shaped like `inputs_embeds`, that is True on every image placeholder position.
+        Raises `ValueError` when the masked positions cannot hold exactly the elements of `image_features`.
+        """
+        if input_ids is None:
+            # No ids available: find placeholders by comparing each embedding with the image token's embedding.
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        # One feature vector per placeholder token; derived from numel so it is right for 3-D and 4-D features alike.
+        n_image_features = image_features.numel() // image_features.shape[-1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Cohere2VisionModelOutputWithPast]:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither were given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            # Overwrite the placeholder embeddings with the image features, in order.
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+        # From here on the language model only ever sees embeddings, never `input_ids`.
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Cohere2VisionModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The COHERE2_VISION model which consists of a vision backbone and a language model.
+    """
+)
+class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]  # NOTE(review): presumably lets lm_head share the input embedding weights — confirm
+
+    def __init__(self, config: Cohere2VisionConfig):
+        super().__init__(config)
+        self.model = Cohere2VisionModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        return self.model.get_image_features(pixel_values=pixel_values)
+
+    # Read-only aliases so the inner model's submodules stay reachable from this wrapper (backward compatibility)
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        image_sizes: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration
+        >>> import torch
+
+        >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True)
+        >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto")
+
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
+        ...             },
+        ...             {"type": "text", "text": "what is in this image?"},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(
+        ...     messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
+        ... ).to(model.device)
+
+        >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3)
+        >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            image_sizes=image_sizes,  # not a named parameter of Cohere2VisionModel.forward; it travels on inside **kwargs
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Project only the requested positions: an int keeps the last `logits_to_keep` (0 keeps all); a tensor indexes directly
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(  # `loss_function` is not defined in this class; it comes from a base class
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return Cohere2VisionCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- `pixel_values` is deliberately kept out of the parent call and re-attached only when needed
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:  # NOTE(review): indexes `cache_position`, so its None default cannot actually be used
+            # Position 0 marks the first step, where the prompt still holds the image placeholder tokens: pass the images.
+            # On later (cached) steps the key stays unset, because the new input ids contain no image token anymore.
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+
+__all__ = ["Cohere2VisionForConditionalGeneration", "Cohere2VisionPreTrainedModel", "Cohere2VisionModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ef20305b99e6e6bd68bf02a7e67cd642e2c9ca1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/modular_cohere2_vision.py
@@ -0,0 +1,318 @@
+# coding=utf-8
+# Copyright 2025 the Cohere Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Cohere2Vision model."""
+
+from functools import lru_cache
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+
+from transformers.models.aya_vision.modeling_aya_vision import (
+    AyaVisionCausalLMOutputWithPast,
+    AyaVisionForConditionalGeneration,
+    AyaVisionModel,
+    AyaVisionModelOutputWithPast,
+)
+from transformers.models.got_ocr2.image_processing_got_ocr2_fast import GotOcr2ImageProcessorFast
+
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils.generic import check_model_inputs
+from .configuration_cohere2_vision import Cohere2VisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class Cohere2VisionMultiModalProjector(nn.Module):  # folds neighbouring vision tokens, then maps them to the text hidden size
+    def __init__(self, config: Cohere2VisionConfig):
+        super().__init__()
+        self.config = config
+        self.downsample_factor = config.downsample_factor
+        self.intermediate_size = config.alignment_intermediate_size
+        self.linear_1 = nn.Linear(  # input width matches the channel count produced by pixel_shuffle
+            config.vision_config.hidden_size * (config.downsample_factor**2), self.intermediate_size, bias=True
+        )
+        self.act = nn.SiLU()  # forward() splits linear_1's output in two, which is why linear_2 takes half the width
+        self.linear_2 = nn.Linear(self.intermediate_size // 2, config.text_config.hidden_size, bias=True)
+
+    def pixel_shuffle(self, image_features):  # image_features: (B, S, D)
+        batch_size, seq_length, feature_dim = image_features.shape  # feature_dim is not used below
+        height = width = int(seq_length**0.5)  # the S tokens are treated as a square grid
+        image_features = image_features.reshape(image_features.shape[0], width, height, -1)
+        channels = image_features.shape[-1]
+        image_features = image_features.reshape(  # fold `downsample_factor` neighbours along one axis into channels
+            batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor)
+        )
+        image_features = image_features.permute(0, 2, 1, 3)
+        image_features = image_features.reshape(  # same fold along the other axis
+            batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1
+        )
+        image_features = image_features.permute(0, 2, 1, 3)  # 4-D result; assumes the grid side divides evenly by the factor
+        return image_features
+
+    def forward(self, image_features):
+        image_features = self.pixel_shuffle(image_features)
+        hidden_states = self.linear_1(image_features)
+
+        # linear_1's output holds two halves along the last dimension: the values `x` and their gate (SwiGLU)
+        x, gate = hidden_states.chunk(2, dim=-1)
+        hidden_states = self.act(gate) * x
+
+        hidden_states = self.linear_2(hidden_states)  # last dimension becomes config.text_config.hidden_size
+        return hidden_states
+
+
+class Cohere2VisionModelOutputWithPast(AyaVisionModelOutputWithPast):  # AyaVision output reused under a Cohere2Vision name
+    pass  # nothing added or overridden
+
+
+class Cohere2VisionCausalLMOutputWithPast(AyaVisionCausalLMOutputWithPast):  # AyaVision output reused under a Cohere2Vision name
+    pass  # nothing added or overridden
+
+
+class Cohere2VisionModel(AyaVisionModel):
+    _checkpoint_conversion_mapping = {}
+
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        """
+        Run the vision tower on `pixel_values` and project its last hidden state into the text embedding space.
+
+        Args:
+            pixel_values (`torch.FloatTensor`):
+               Image input, handed to the vision tower unchanged (its expected layout is defined by that model).
+        Returns:
+            `torch.Tensor`: Output of `multi_modal_projector` (a single tensor, not a list); its last dimension is
+            `config.text_config.hidden_size`.
+        """
+
+        image_features = self.vision_tower(pixel_values, output_hidden_states=True)
+        selected_image_feature = image_features.last_hidden_state
+        image_features = self.multi_modal_projector(selected_image_feature)
+        return image_features
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, Cohere2VisionModelOutputWithPast]:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither were given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(  # not defined in this class; presumably inherited from AyaVisionModel
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)  # fill placeholders in order
+
+        outputs = self.language_model(  # only embeddings are passed on, never `input_ids`
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Cohere2VisionModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+class Cohere2VisionForConditionalGeneration(AyaVisionForConditionalGeneration):
+    _checkpoint_conversion_mapping = {}
+
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        return self.model.get_image_features(pixel_values=pixel_values)
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        image_sizes: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Cohere2VisionCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoProcessor, Cohere2VisionForConditionalGeneration
+        >>> import torch
+
+        >>> processor = AutoProcessor.from_pretrained("CohereLabs/command-a-vision-07-2025", use_fast=True)
+        >>> model = Cohere2VisionForConditionalGeneration.from_pretrained("CohereLabs/command-a-vision-07-2025", device_map="auto")
+
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {
+        ...                 "type": "image",
+        ...                 "url": "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg",
+        ...             },
+        ...             {"type": "text", "text": "what is in this image?"},
+        ...         ],
+        ...     },
+        ... ]
+
+        >>> inputs = processor.apply_chat_template(
+        ...     messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
+        ... ).to(model.device)
+
+        >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3)
+        >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            image_sizes=image_sizes,  # not a named parameter of Cohere2VisionModel.forward; it travels on inside **kwargs
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Project only the requested positions: an int keeps the last `logits_to_keep` (0 keeps all); a tensor indexes directly
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])  # `lm_head` is not created in this class; it comes from the parent
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(  # `loss_function` is not defined in this class; it comes from a base class
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return Cohere2VisionCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+
+@lru_cache(maxsize=10)
+def get_all_supported_aspect_ratios(max_image_tiles: int) -> list[tuple[int, int]]:
+    """
+    Enumerate every tile grid whose total tile count fits within `max_image_tiles`.
+
+    Grids are listed with the first component ascending and, for equal first
+    components, the second component ascending. Results are memoised by
+    `lru_cache`, so repeated calls with the same argument return the same list object.
+
+    Args:
+        max_image_tiles (`int`):
+            Upper bound on `width * height` of a grid.
+
+    Returns:
+        `list[tuple[int, int]]`: All `(width, height)` grids, in tiles, with
+        `width * height <= max_image_tiles`.
+
+    Example:
+        >>> get_all_supported_aspect_ratios(4)
+        [(1, 1), (1, 2), (1, 3), (1, 4), (2, 1), (2, 2), (3, 1), (4, 1)]
+    """
+    tile_counts = range(1, max_image_tiles + 1)
+    return [
+        (n_wide, n_high)
+        for n_wide in tile_counts
+        for n_high in tile_counts
+        if n_wide * n_high <= max_image_tiles
+    ]
+
+
+def get_optimal_tiled_canvas(
+    original_image_size: tuple[int, int],
+    target_tile_size: tuple[int, int],
+    min_image_tiles: int,
+    max_image_tiles: int,
+) -> tuple[int, int]:
+    """Pick the tile grid, as `(width, height)` in tiles, that best fits an image given as `(height, width)`.
+
+    `min_image_tiles` is accepted but not applied: every grid of up to `max_image_tiles` tiles is a candidate."""
+    # Smallest grids first, so that ties in the argmax/argmin below go to the grid with fewer tiles.
+    grids = sorted(get_all_supported_aspect_ratios(max_image_tiles), key=lambda grid: grid[0] * grid[1])
+    image_height, image_width = original_image_size
+    tile_height, _tile_width = target_tile_size  # (height == width)
+
+    # Grids are (width, height), so the image size must be laid out in the same order before dividing.
+    # Taking the smaller of the two per-axis ratios gives the scale at which the whole image fits the canvas.
+    fit_scale = np.min(np.array(grids) * tile_height / np.array([image_width, image_height]), axis=-1)
+    if np.all(fit_scale < 1):
+        # We are forced to downscale, so try to minimize the amount of downscaling
+        return grids[np.argmax(fit_scale)]
+    # Pick the resolution that required the least upscaling so that it most closely fits the image;
+    # grids that would require downscaling get a huge sentinel so argmin never selects them.
+    return grids[np.argmin(np.where(fit_scale < 1.0, 10e9, fit_scale))]
+
+
+@auto_docstring
+class Cohere2VisionImageProcessorFast(GotOcr2ImageProcessorFast):  # GotOcr2 fast processor with Cohere2Vision class-level defaults
+    size = {"height": 512, "width": 512}  # NOTE(review): presumably the pixel size of one tile — confirm in the base class
+    min_patches = 1
+    max_patches = 12
+    crop_to_patches = True
+    patch_size = 16  # NOTE(review): presumably the vision encoder's patch edge, distinct from `size` above — confirm
+
+
+__all__ = [
+    "Cohere2VisionForConditionalGeneration",
+    "Cohere2VisionPreTrainedModel",  # noqa: F822
+    "Cohere2VisionModel",
+    "Cohere2VisionImageProcessorFast",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..b72e1512ead9612c9e601101326f532ed3bf0d1a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2_vision/processing_cohere2_vision.py
@@ -0,0 +1,216 @@
+# coding=utf-8
+# Copyright 2025 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+
+
+class Cohere2VisionImagesKwargs(ImagesKwargs, total=False):  # image kwargs type used by Cohere2VisionProcessorKwargs
+    max_patches: Optional[int]  # presumably caps the number of patches per image -- TODO confirm in image processor
+
+
+class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False):  # passed to `_merge_kwargs` in `__call__`
+    images_kwargs: Cohere2VisionImagesKwargs
+    _defaults = {  # presumably used as fallback values by `_merge_kwargs` -- TODO confirm in ProcessorMixin
+        "text_kwargs": {
+            "padding_side": "left",
+            "padding": True,
+            "return_mm_token_type_ids": False,  # popped in `Cohere2VisionProcessor.__call__` before tokenizing
+        },
+    }
+
+
+class Cohere2VisionProcessor(ProcessorMixin):
+    r"""
+    Constructs a Cohere2Vision processor which wraps a [`AutoImageProcessor`] and
+    [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
+    tokenizer functionalities. See the [`~Cohere2VisionProcessor.__call__`] and [`~Cohere2VisionProcessor.decode`] for more information.
+    Args:
+        image_processor ([`AutoImageProcessor`], *optional*):
+            The image processor is a required input; it must expose a `patch_size` attribute.
+        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
+            The tokenizer is a required input; it must expose `boi_token`, `eoi_token`, `image_token`,
+            `img_line_break_token` and `image_token_id`.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+        # Cache the patch size and the special tokens needed to expand image tokens in `__call__`.
+        self.patch_size = self.image_processor.patch_size
+        self.boi_token = tokenizer.boi_token
+        self.eoi_token = tokenizer.eoi_token
+        self.image_token = tokenizer.image_token
+        self.img_line_break_token = tokenizer.img_line_break_token
+        self.image_token_id = tokenizer.image_token_id
+
+        # Ids of every image-related special token; `__call__` uses them to build `mm_token_type_ids`.
+        self.image_ids = tokenizer.convert_tokens_to_ids(
+            [self.image_token, self.boi_token, self.eoi_token, self.img_line_break_token]
+        )
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        **kwargs: Unpack[Cohere2VisionProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text.
+        To prepare the vision inputs, `images` and `kwargs` are forwarded to GotOcr2ImageProcessor's
+        [`~GotOcr2ImageProcessor.__call__`]; each image token in `text` is then expanded for one image, in order.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **mm_token_type_ids** -- List with 1 at image-related token positions (image, BOI, EOI and line-break
+              tokens) and 0 elsewhere. Returned when `return_mm_token_type_ids=True`.
+
+        Raises:
+            ValueError: If `text` is `None`, or if `text` contains more image tokens than images were provided.
+        """
+        if text is None:
+            raise ValueError("You have to specify text.")
+        elif not isinstance(text, (list, tuple)):
+            text = [text]
+
+        output_kwargs = self._merge_kwargs(
+            Cohere2VisionProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        # Process images
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            batch_num_patches = iter(image_inputs.pop("num_patches"))
+            # One tile is `patch_size**2` placeholders plus a line break. A temporary "<placeholder>" marker is
+            # used so that the `while` loop below never matches the image tokens it has just inserted.
+            tile_string = "<placeholder>" * int(self.patch_size**2) + self.img_line_break_token
+            processed_text = []
+            for sample in text:
+                while self.image_token in sample:
+                    try:
+                        num_patches = next(batch_num_patches)
+                    except StopIteration:
+                        raise ValueError(
+                            "The number of image tokens in `text` exceeds the number of images provided."
+                        ) from None
+                    # At least one tile is always emitted, even if `num_patches` is reported as 0.
+                    img_string = f"{self.boi_token}{tile_string * max(num_patches, 1)}{self.eoi_token}"
+                    sample = sample.replace(self.image_token, img_string, 1)
+                processed_text.append(sample)
+            text = [sample.replace("<placeholder>", self.image_token) for sample in processed_text]
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None)
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(array_ids)
+            mm_token_type_ids[np.isin(array_ids, self.image_ids)] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+            kwargs: Image kwargs merged over the defaults and forwarded to `get_number_of_image_patches`.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            # Copy the defaults so per-call overrides never leak into the shared class-level `_defaults`.
+            images_kwargs = dict(Cohere2VisionProcessorKwargs._defaults.get("images_kwargs", {}))
+            images_kwargs.update(kwargs)
+
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+
+            token_per_patch = int(self.patch_size**2)
+            # Per image: BOI + EOI (2 tokens) plus, for every patch, its image tokens and one line-break token.
+            num_image_tokens = [2 + num_patches * (token_per_patch + 1) for num_patches in num_image_patches]
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(tokenizer_input_names) + list(image_processor_input_names)
+
+
+__all__ = ["Cohere2VisionProcessor"]  # the only name exported by this module
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9d5cc944a817b4eedb0d918aab1aba15e88804f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/configuration_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/configuration_colpali.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..269d2da2974769107e6b56b0300257f86b54e553
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/configuration_colpali.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modeling_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modeling_colpali.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5706329fa9d0eaa707931a9cbf2a762150a3811e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modeling_colpali.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modular_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modular_colpali.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d130c76793909c4e4f55976cf1c260a02d04526
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/modular_colpali.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/processing_colpali.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/processing_colpali.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3f2ec0260fabaa595c592f786ed7987022585b6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colpali/__pycache__/processing_colpali.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1df410a0981173307eda7266c5c3ea629586d453
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/configuration_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/configuration_colqwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2774052a1589971f7235fbd6e9cd94b7c815df4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/configuration_colqwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modeling_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modeling_colqwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ee0ec9df5ed4c229a90783ecfb6d84b6f089e37
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modeling_colqwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modular_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modular_colqwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d9c3a6ad8fb17bfedb985cc51da79c75dc0232b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/modular_colqwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/processing_colqwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/processing_colqwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc2ff757be25a7d7e9bfffdd11dd346c6a48cb79
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/colqwen2/__pycache__/processing_colqwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..662abc7dbc33f5665904117ae33b533396cd9344
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/configuration_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/configuration_conditional_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40c5e3cd1fed896d49c8aa90967f1604461b0d28
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/configuration_conditional_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/feature_extraction_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/feature_extraction_conditional_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ee9787cf49eb0e31311f38b8e05a362e39f00c4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/feature_extraction_conditional_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..068b40435c7c3fcf26e6f79e75770949389803c7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b3ce642d5148a9e52a4bc04ffa169146a645234
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/image_processing_conditional_detr_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/modular_conditional_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/modular_conditional_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee1f309e607af4c0c1550545ec822aa1ef750a0b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/conditional_detr/__pycache__/modular_conditional_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b50084199fed7742cb7ccc63b7466829a77d11e1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/configuration_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/configuration_convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba66fb14a2f9c133788b9f2d3d4dd232df63ead2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/configuration_convnext.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/feature_extraction_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/feature_extraction_convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d77cb3025910fc7a1db56e298d4a14fc5fd4627
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/feature_extraction_convnext.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb98e5843061e90a3c5630914c7f41a3b9b02354
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6f7810ac3ecac4ee22c78816bd1ae7974a9643b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/image_processing_convnext_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..529f2321642b2c4ca5aed584af46263c35e00c42
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_convnext.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_tf_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_tf_convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..664014aed4f63cec3d04bc955a0e79d7c781d9c6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnext/__pycache__/modeling_tf_convnext.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd06c0dc23ae61fdaaa4d361e2bdb949718df946
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e98a4f0ba189bd275760924c021fdd9a1417fdae
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f658dc253715b5d8957b7b1c18b43883c1722693
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cpm/__pycache__/tokenization_cpm_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f0c4b7cd565db868a1234e00904e97f2efeef08
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/configuration_dac.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/configuration_dac.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33f0bb1d445b20f9ab9c5005232658d972853534
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/configuration_dac.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/feature_extraction_dac.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/feature_extraction_dac.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c429610f261518c8c184e0ad35ef8fba91bb742a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/feature_extraction_dac.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/modeling_dac.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/modeling_dac.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5343657b22912a96036076bd1b61e3f1fc9e1fa8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__pycache__/modeling_dac.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bec515dda6f1586763d86fbc3d56c2ea2e7d9f01
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/configuration_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/configuration_deberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fad387634d4219a8151cd052cfadf270f3a03622
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/configuration_deberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_deberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..759326e6b6979c77dd9e433bd2cccb199a4aa4cc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_deberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_tf_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_tf_deberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d33cd9ac0633580ea92bcad7ee889519deba4d7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/modeling_tf_deberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8df68220ce114d798e53eac94b03de956032d00
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8201972d61e752c0ffd93c1c062f604418f20405
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta/__pycache__/tokenization_deberta_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed5162ccb6d7655517546d65458fed94b0d374f6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/configuration_deepseek_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/configuration_deepseek_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b27386c4c990189b689b0a815594b5227866d42
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/configuration_deepseek_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modeling_deepseek_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modeling_deepseek_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a9c0ae267281ec133b7a10fc8a33634e8617570
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modeling_deepseek_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modular_deepseek_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modular_deepseek_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0491301ccc8e9cb97126890ad5700d34c9d7cff1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_v2/__pycache__/modular_deepseek_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b00e674b6c8bfced43aedd3fa62d835238fcfd2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0facdba6154a63c6541edcd405f74916486b8279
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d30748b8f8b363bfbbbc65b192c4d34402d312fd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73d92f41dcdba14af5f35f9d694fff41ee55a312
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab18853681c9d57b30bff23365330b33475a3505
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a7e9c89a5eec6a0ff5c1d2fc05f0203a27c9b03
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ea81a02094813cfed89e2b343f255c17050ae0c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50373dc1de6c27744f52a0e3706487d1aefd6bab
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6488b055d4fe33f08c1d1a1ac0c07e7879353f56
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24e9b5c43bd7183878f579e4d6c4139d9794d96d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93b75021706bf9372741f3dd0843051bf5b4879e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22d5d4035f7dfc4ccae79dc6327156fa914b7fd0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27e7e5b7b1901f11b058501d851f3dd39c819bac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ec772ff99042e51d2518304800f7c94f31be185
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b28084a30fc63c52a9a856ea070b068ff14e3b1b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57423d39c0a53570e5c2e97e7e5027ee7de76a91
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7baaf001c7756b563594b2ffa8ecf820dacfcc63
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..997fb71e021a364cccb9684f37fcd04d5195eec6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d818441b648a6b73ff0b75552a36a4471486094
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c77156e588696aed379d44e6fe5810fc080e1c1e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8798159fb8c8a5dcb00e5bec17ae6b117b479f77
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85f002bc8f134bdd02558d911cacf685b56c905c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8936afb6c0bed2c755998863252d8ba8edf6e15
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0f583c0ecc79fad5498dcaaf364cd6c39473d3e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53f493dab11b1ef493b18d75ddf84b8a933cf4d0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a6c14a1d11cded6ee77b9683ac64ddf9851dc1b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed8941496901fb69b0eccb8b8e26010ec2e60c7b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b309ff23029e1d9365ba172230c6fa00c345c0a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d92e37e13eae72dce715e4d95811c6d667c75a7e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecd98bb30237a0ac68d9aa7ffca03ebe81676a02
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e39ca858f266338346ed75f1bd3167c80df91b6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3efe9a62c2af8b4bc0f9be522b284e668f95d1a0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a49e89f9695c18d74cdf9ba81c9c246a87538c8b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..663012339337cc1ffdd8f613d216428baaca9072
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04d30077506511b9abc7b31601daebe76b874534
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67fb5d2ce23749442a3029565f1a662657d92352
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee3011aca424d392bcd5c0e08012eeea1f4fa5ca
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc9749bac04ef66c1d6d827870eb171876116c21
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c7d206f499e7c947fcdfd2576cc61185f2298dc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d21fbb059d8780ad2d6bfe908264badfc0892d52
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d32e2ef88daa2379086aa3025f371b817d95689
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..505372a1f3126bab069a15f2df6f820b5113511e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d6f2bd0d56c9cc85a2fa37d136578a94bd34411
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad05ad84f244edf0dc46b84d7353d41cc4b0b9c5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d639585a62d2600911df909d11a7fc4d5463e41a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb559947f1e645d567910f0400c453a4249b2956
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbd01b1d2a8d4aa5fec0be1c8c0a3a2d2a121fad
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c3d28286c72c6b83c49aa4aae19fe5bac85416f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83a1002a93b984d3397a241a4d33697a1e631f16
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec068e6350e977f731ca352b584ff86e93abeb62
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cf611b71d3e333ef4ed226dc8febf7537d5f738
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12b18b1be7225caf0fc028d23aa8379f32709a5a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca490e414aa5e3c6f9141c6c634543a73eaae001
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07a9692c891e55aa62c68dd0f84cd5863308d989
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0e98c6ed1348cbf4d5024e9d79f3ae2a3761b55
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6104b101cbb7e3f9f9b129bc2fc48296877e0f8b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e65c5409aeb7b4702b794bfc80ad28099948dd36
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f46b323f6debbdc8199e00a63cd060fd1653b184
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fb0cfed949f1613f124f73add749f07d18309b4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..181bdafbadd4e47282728c1b1a14f6ad454b83e5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d71ec4bb8573a1ae37d8dd1be174e1cec7416f4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..275528a54427b5575a0f455a3558ba92e11ca027
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35e5505a14e5c096c9a585a89d80d88f2b71a8e2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1b969fca47c5e08a2f3f3c5f5a41d9e64fd577a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c61587acbc74c55dd8cbb26104f4c0e4f1efad8f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a4e88106d6317e7a359c9f67c2b03b29499fbcb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84f0f6a14dc631ce080ac6f2d36f84dbe4686503
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb9ea6aa3292c0693b3ad753a4e3c482f6297642
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c54dd6bce9331efad73dadbbf8325fc8ccdf98f3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3b73e39bedb436c6eeaaad645ed3614908def5b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e3df676e90e47b24dc1067fd19e424be260d0835
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec868cb35462973dab88650ce9d56b7dbc85ae22
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c13b073a9c4f32f589a3bc94ac0a20086ffd0854
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76f40143ba9dfbbd6b311c907b3a0fcbb8a55892
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5ee3036f7ccd339675c9f131226c871e983d076
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20e9a6e54913fd471f6c05ffa180d12dd43b3299
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a766af57b76487fe84e4fdf7ff4dcc5dab6b5f04
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0eb9b8732929c0f1ddcea743296272ff3d733ce8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1082209e23e7110e9a45c80a80ffe4ff3da543f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51206db9eb1a3aa8ca01aa7c5d645d2e10a94e93
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4acfd5284e937d297cc6537cdc63a89b5a6fa6d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8bee80fd9e664f74c0631e74c7924263ae37c8a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32a64173e4d529cd6fa975a878343540ae94ec5a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d6caf685e78df6a2580229e7a0b7d4acb26dd7c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e27efac01e38f50ea377d33d8028d10eca679cd0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf4f92fd20d225ecd05171e112e02c3691971e45
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8094c60c893d1a8d262cca889dc6b2de0bd1ac6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5f8da49cb85bb12423a85c85479b5bc654b1a9e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77c15953ba03c2e220a4a1978fd5ae8afcb74d85
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b8aae5e70381b21772bdec395693c007a9c02b7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # type-checker-only branch: star-import the public names of each DETR submodule
+    from .configuration_detr import *
+    from .feature_extraction_detr import *
+    from .image_processing_detr import *
+    from .image_processing_detr_fast import *
+    from .modeling_detr import *
+else:  # runtime branch: none of the submodules above are imported by this file directly
+    import sys
+
+    _file = globals()["__file__"]  # path of this __init__.py, passed to both calls below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)  # replaces this module's sys.modules entry with a _LazyModule instance
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/feature_extraction_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/feature_extraction_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..a81f83c8c313bdb8a904f0b359360c0e100a83d9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/feature_extraction_detr.py
@@ -0,0 +1,48 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for DETR."""
+
+import warnings
+
+from ...image_transforms import rgb_to_id as _rgb_to_id
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_detr import DetrImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+def rgb_to_id(x):  # deprecated shim: emits a FutureWarning, then forwards `x` unchanged
+    warnings.warn(
+        "rgb_to_id has moved and will not be importable from this module from v5. "
+        "Please import from transformers.image_transforms instead.",
+        FutureWarning,
+    )
+    return _rgb_to_id(x)  # `_rgb_to_id` is `rgb_to_id` imported from ...image_transforms under an alias
+
+
+@requires(backends=("vision",))
+class DetrFeatureExtractor(DetrImageProcessor):  # deprecated name; the class body only overrides __init__
+    def __init__(self, *args, **kwargs) -> None:  # accepts any arguments and forwards them untouched
+        warnings.warn(
+            "The class DetrFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+            " Please use DetrImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)  # construction is delegated to DetrImageProcessor
+
+
+__all__ = ["DetrFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/image_processing_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/image_processing_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..f29bd48a5934cca8224a91d0cc25ce54f79e169f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/image_processing_detr.py
@@ -0,0 +1,2049 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for DETR."""
+
+import io
+import pathlib
+from collections import defaultdict
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    PaddingMode,
+    center_to_corners_format,
+    corners_to_center_format,
+    id_to_rgb,
+    pad,
+    rescale,
+    resize,
+    rgb_to_id,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_annotations,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    is_flax_available,
+    is_jax_tensor,
+    is_scipy_available,
+    is_tf_available,
+    is_tf_tensor,
+    is_torch_available,
+    is_torch_tensor,
+    is_vision_available,
+    logging,
+)
+from ...utils.import_utils import requires
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+
+if is_vision_available():
+    import PIL
+
+
+if is_scipy_available():
+    import scipy.special
+    import scipy.stats
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# From the original repo: https://github.com/facebookresearch/detr/blob/3af9fa878e73b6894ce3596450a8d9b89d918ca9/datasets/transforms.py#L76
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size.
+
+    Args:
+        image_size (`tuple[int, int]`):
+            The input image size.
+        size (`int`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+    """
+    height, width = image_size  # image_size is unpacked as (height, width)
+    raw_size = None  # un-rounded shorter-edge target; stays None unless max_size forces a smaller target
+    if max_size is not None:
+        min_original_size = float(min((height, width)))
+        max_original_size = float(max((height, width)))
+        if max_original_size / min_original_size * size > max_size:  # longer edge would exceed max_size at this `size`
+            raw_size = max_size * min_original_size / max_original_size  # shorter-edge length that puts the longer edge at max_size
+            size = int(round(raw_size))  # `size` is overwritten with the rounded, reduced target
+
+    if (height <= width and height == size) or (width <= height and width == size):  # shorter edge already equals the target
+        oh, ow = height, width  # input size is returned unchanged
+    elif width < height:  # width is the shorter edge
+        ow = size
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)  # uses the un-rounded raw_size rather than the rounded `size`; int() truncates
+        else:
+            oh = int(size * height / width)
+    else:  # height <= width here, so height takes the target value
+        oh = size
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)  # uses the un-rounded raw_size rather than the rounded `size`; int() truncates
+        else:
+            ow = int(size * width / height)
+
+    return (oh, ow)  # (height, width) order, matching how image_size was unpacked
+
+
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
+    Important, even if image_height < max_height and image_width < max_width, the image will be resized
+    to at least one of the edges be equal to max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size  # result of get_image_size is unpacked as (height, width)
+    height_scale = max_height / height  # true division: scales are floats and may be above or below 1
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)  # one common factor for both edges, so the aspect ratio is kept
+    new_height = int(height * min_scale)  # int() truncates; NOTE(review): float error may leave the limiting edge 1 px short - confirm
+    new_width = int(width * min_scale)
+    return new_height, new_width  # (height, width) order
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, tuple[int, int], list[int]],
+    max_size: Optional[int] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size. If the desired output size
+    is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
+    image size is computed by keeping the aspect ratio of the input image size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `tuple[int, int]` or `list[int]`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)  # evaluated before the type check, so it runs on both paths
+    if isinstance(size, (list, tuple)):
+        return size  # same object handed back: a list input stays a list despite the tuple[int, int] annotation
+
+    return get_size_with_aspect_ratio(image_size, size, max_size)  # any non-list/tuple `size` takes this path; max_size is used only here
+
+
+def get_numpy_to_framework_fn(arr) -> Callable:
+    """
+    Returns a function that converts a numpy array to the framework of the input array.
+
+    Args:
+        arr (`np.ndarray`): The array to convert.
+    """
+    if isinstance(arr, np.ndarray):
+        return np.array
+    if is_tf_available() and is_tf_tensor(arr):
+        import tensorflow as tf
+
+        return tf.convert_to_tensor
+    if is_torch_available() and is_torch_tensor(arr):
+        import torch
+
+        return torch.tensor
+    if is_flax_available() and is_jax_tensor(arr):
+        import jax.numpy as jnp
+
+        return jnp.array
+    raise ValueError(f"Cannot convert arrays of type {type(arr)}")
+
+
+def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
+    """
+    Squeezes an array, but only if the axis specified has dim 1.
+    """
+    if axis is None:
+        return arr.squeeze()
+
+    try:
+        return arr.squeeze(axis=axis)
+    except ValueError:
+        return arr
+
+
+def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict:
+    image_height, image_width = image_size
+    norm_annotation = {}
+    for key, value in annotation.items():
+        if key == "boxes":
+            boxes = value
+            boxes = corners_to_center_format(boxes)
+            boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
+            norm_annotation[key] = boxes
+        else:
+            norm_annotation[key] = value
+    return norm_annotation
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Return the maximum value across all indices of an iterable of values.
+    """
+    return [max(values_i) for values_i in zip(*values)]
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.get_max_height_width
+def get_max_height_width(
+    images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> list[int]:
+    """
+    Get the maximum height and width across all images in a batch.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images[0])
+
+    if input_data_format == ChannelDimension.FIRST:
+        _, max_height, max_width = max_across_indices([img.shape for img in images])
+    elif input_data_format == ChannelDimension.LAST:
+        max_height, max_width, _ = max_across_indices([img.shape for img in images])
+    else:
+        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+    return (max_height, max_width)
+
+
+# Copied from transformers.models.vilt.image_processing_vilt.make_pixel_mask
+def make_pixel_mask(
+    image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for.
+        output_size (`tuple[int, int]`):
+            Output size of the mask.
+    """
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+    mask = np.zeros(output_size, dtype=np.int64)
+    mask[:input_height, :input_width] = 1
+    return mask
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L33
+def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
+    """
+    Convert a COCO polygon annotation to a mask.
+
+    Args:
+        segmentations (`list[list[float]]`):
+            List of polygons, each polygon represented by a list of x-y coordinates.
+        height (`int`):
+            Height of the mask.
+        width (`int`):
+            Width of the mask.
+    """
+    try:
+        from pycocotools import mask as coco_mask
+    except ImportError:
+        raise ImportError("Pycocotools is not installed in your environment.")
+
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = np.asarray(mask, dtype=np.uint8)
+        mask = np.any(mask, axis=2)
+        masks.append(mask)
+    if masks:
+        masks = np.stack(masks, axis=0)
+    else:
+        masks = np.zeros((0, height, width), dtype=np.uint8)
+
+    return masks
+
+
+# inspired by https://github.com/facebookresearch/detr/blob/master/datasets/coco.py#L50
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by DETR.
+
+    Args:
+        image:
+            The image the annotations belong to; only its height and width are read.
+        target:
+            Mapping with an `"image_id"` entry and an `"annotations"` list. Each annotation is read for
+            `"category_id"`, `"area"`, `"bbox"`, optionally `"iscrowd"` and `"keypoints"`, and `"segmentation"`
+            when masks are requested.
+        return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+            Whether to also rasterize the `"segmentation"` polygons into a `"masks"` entry.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            Channel dimension format forwarded to `get_image_size`.
+
+    Returns:
+        `dict`: A new target with `"image_id"`, `"class_labels"`, `"boxes"`, `"area"`, `"iscrowd"` and
+        `"orig_size"`, plus `"keypoints"` and `"masks"` when applicable. Annotations whose clipped box has no
+        positive width or height are dropped from every per-object entry.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+
+    image_id = target["image_id"]
+    image_id = np.asarray([image_id], dtype=np.int64)
+
+    # Get all COCO annotations for the given image.
+    annotations = target["annotations"]
+    # Crowd annotations (`iscrowd` present and non-zero) are discarded up front.
+    annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+    classes = [obj["category_id"] for obj in annotations]
+    classes = np.asarray(classes, dtype=np.int64)
+
+    # for conversion to coco api
+    area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+    iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64)
+
+    boxes = [obj["bbox"] for obj in annotations]
+    # guard against no boxes via resizing
+    boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+    # Add the first two columns to the last two in place (presumably COCO's x, y, w, h -> corner format).
+    boxes[:, 2:] += boxes[:, :2]
+    # Clip the x columns to the image width and the y columns to the image height.
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    # Keep only boxes that still have strictly positive width and height after clipping.
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {}
+    new_target["image_id"] = image_id
+    new_target["class_labels"] = classes[keep]
+    new_target["boxes"] = boxes[keep]
+    new_target["area"] = area[keep]
+    new_target["iscrowd"] = iscrowd[keep]
+    new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+    # Only the first annotation is inspected to decide whether keypoints are present.
+    if annotations and "keypoints" in annotations[0]:
+        keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
+        keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        # Flatten to rows of three values, unless no annotation survived the filter.
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
+    """
+    Compute the bounding boxes around the provided panoptic segmentation masks.
+
+    Args:
+        masks: masks in format `[number_masks, height, width]` where N is the number of masks
+
+    Returns:
+        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+    """
+    if masks.size == 0:
+        return np.zeros((0, 4))
+
+    h, w = masks.shape[-2:]
+    y = np.arange(0, h, dtype=np.float32)
+    x = np.arange(0, w, dtype=np.float32)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    y, x = np.meshgrid(y, x, indexing="ij")
+
+    x_mask = masks * np.expand_dims(x, axis=0)
+    x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
+    x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
+    x_min = x.filled(fill_value=1e8)
+    x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
+
+    y_mask = masks * np.expand_dims(y, axis=0)
+    y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
+    y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
+    y_min = y.filled(fill_value=1e8)
+    y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
+
+    return np.stack([x_min, y_min, x_max, y_max], 1)
+
+
+def prepare_coco_panoptic_annotation(
+    image: np.ndarray,
+    target: dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
+) -> dict:
+    """
+    Prepare a coco panoptic annotation for DETR.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+    annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+    new_target = {}
+    new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
+    new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
+    new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
+
+    if "segments_info" in target:
+        masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
+        masks = rgb_to_id(masks)
+
+        ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
+        masks = masks == ids[:, None, None]
+        masks = masks.astype(np.uint8)
+        if return_masks:
+            new_target["masks"] = masks
+        new_target["boxes"] = masks_to_boxes(masks)
+        new_target["class_labels"] = np.array(
+            [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
+        )
+        new_target["iscrowd"] = np.asarray(
+            [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
+        )
+        new_target["area"] = np.asarray(
+            [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
+        )
+
+    return new_target
+
+
+def get_segmentation_image(
+    masks: np.ndarray, input_size: tuple, target_size: tuple, stuff_equiv_classes, deduplicate=False
+):
+    h, w = input_size
+    final_h, final_w = target_size
+
+    m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
+
+    if m_id.shape[-1] == 0:
+        # We didn't detect any mask :(
+        m_id = np.zeros((h, w), dtype=np.int64)
+    else:
+        m_id = m_id.argmax(-1).reshape(h, w)
+
+    if deduplicate:
+        # Merge the masks corresponding to the same stuff class
+        for equiv in stuff_equiv_classes.values():
+            for eq_id in equiv:
+                m_id[m_id == eq_id] = equiv[0]
+
+    seg_img = id_to_rgb(m_id)
+    seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
+    return seg_img
+
+
+def get_mask_area(seg_img: np.ndarray, target_size: tuple[int, int], n_classes: int) -> np.ndarray:
+    final_h, final_w = target_size
+    np_seg_img = seg_img.astype(np.uint8)
+    np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
+    m_id = rgb_to_id(np_seg_img)
+    area = [(m_id == i).sum() for i in range(n_classes)]
+    return area
+
+
+def score_labels_from_class_probabilities(logits: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    probs = scipy.special.softmax(logits, axis=-1)
+    labels = probs.argmax(-1, keepdims=True)
+    scores = np.take_along_axis(probs, labels, axis=-1)
+    scores, labels = scores.squeeze(-1), labels.squeeze(-1)
+    return scores, labels
+
+
+def post_process_panoptic_sample(
+    out_logits: np.ndarray,
+    masks: np.ndarray,
+    boxes: np.ndarray,
+    processed_size: tuple[int, int],
+    target_size: tuple[int, int],
+    is_thing_map: dict,
+    threshold=0.85,
+) -> dict:
+    """
+    Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
+
+    Args:
+        out_logits (`torch.Tensor`):
+            The logits for this sample.
+        masks (`torch.Tensor`):
+            The predicted segmentation masks for this sample.
+        boxes (`torch.Tensor`):
+            The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
+            width, height)` and values between `[0, 1]`, relative to the size the image (disregarding padding).
+        processed_size (`tuple[int, int]`):
+            The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
+            after data augmentation but before batching.
+        target_size (`tuple[int, int]`):
+            The target size of the image, `(height, width)` corresponding to the requested final size of the
+            prediction.
+        is_thing_map (`Dict`):
+            A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
+        threshold (`float`, *optional*, defaults to 0.85):
+            The threshold used to binarize the segmentation masks.
+    """
+    # we filter empty queries and detection below threshold
+    scores, labels = score_labels_from_class_probabilities(out_logits)
+    keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
+
+    cur_scores = scores[keep]
+    cur_classes = labels[keep]
+    cur_boxes = center_to_corners_format(boxes[keep])
+
+    if len(cur_boxes) != len(cur_classes):
+        raise ValueError("Not as many boxes as there are classes")
+
+    cur_masks = masks[keep]
+    cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
+    cur_masks = safe_squeeze(cur_masks, 1)
+    b, h, w = cur_masks.shape
+
+    # It may be that we have several predicted masks for the same stuff class.
+    # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+    cur_masks = cur_masks.reshape(b, -1)
+    stuff_equiv_classes = defaultdict(list)
+    for k, label in enumerate(cur_classes):
+        if not is_thing_map[label]:
+            stuff_equiv_classes[label].append(k)
+
+    seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
+    area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))
+
+    # We filter out any mask that is too small
+    if cur_classes.size() > 0:
+        # We know filter empty masks as long as we find some
+        filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+        while filtered_small.any():
+            cur_masks = cur_masks[~filtered_small]
+            cur_scores = cur_scores[~filtered_small]
+            cur_classes = cur_classes[~filtered_small]
+            seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
+            area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
+            filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+    else:
+        cur_classes = np.ones((1, 1), dtype=np.int64)
+
+    segments_info = [
+        {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
+        for i, (cat, a) in enumerate(zip(cur_classes, area))
+    ]
+    del cur_classes
+
+    with io.BytesIO() as out:
+        PIL.Image.fromarray(seg_img).save(out, format="PNG")
+        predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+
+    return predictions
+
+
+def resize_annotation(
+    annotation: dict[str, Any],
+    orig_size: tuple[int, int],
+    target_size: tuple[int, int],
+    threshold: float = 0.5,
+    resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+    """
+    Resizes an annotation to a target size.
+
+    Args:
+        annotation (`dict[str, Any]`):
+            The annotation dictionary.
+        orig_size (`tuple[int, int]`):
+            The original size of the input image.
+        target_size (`tuple[int, int]`):
+            The target size of the image, as returned by the preprocessing `resize` step.
+        threshold (`float`, *optional*, defaults to 0.5):
+            The threshold used to binarize the segmentation masks.
+        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+            The resampling filter to use when resizing the masks.
+    """
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
+    ratio_height, ratio_width = ratios
+
+    new_annotation = {}
+    new_annotation["size"] = target_size
+
+    for key, value in annotation.items():
+        if key == "boxes":
+            boxes = value
+            scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+            new_annotation["boxes"] = scaled_boxes
+        elif key == "area":
+            area = value
+            scaled_area = area * (ratio_width * ratio_height)
+            new_annotation["area"] = scaled_area
+        elif key == "masks":
+            masks = value[:, None]
+            masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
+            masks = masks.astype(np.float32)
+            masks = masks[:, 0] > threshold
+            new_annotation["masks"] = masks
+        elif key == "size":
+            new_annotation["size"] = target_size
+        else:
+            new_annotation[key] = value
+
+    return new_annotation
+
+
+# TODO - (Amy) make compatible with other frameworks
+def binary_mask_to_rle(mask):
+    """
+    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
+
+    Args:
+        mask (`torch.Tensor` or `numpy.array`):
+            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
+            segment_id or class_id.
+    Returns:
+        `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
+        format.
+    """
+    if is_torch_tensor(mask):
+        mask = mask.numpy()
+
+    pixels = mask.flatten()
+    pixels = np.concatenate([[0], pixels, [0]])
+    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
+    runs[1::2] -= runs[::2]
+    return list(runs)
+
+
+# TODO - (Amy) make compatible with other frameworks
+def convert_segmentation_to_rle(segmentation):
+    """
+    Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
+
+    Args:
+        segmentation (`torch.Tensor` or `numpy.array`):
+            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
+    Returns:
+        `list[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
+    """
+    segment_ids = torch.unique(segmentation)
+
+    run_length_encodings = []
+    for idx in segment_ids:
+        mask = torch.where(segmentation == idx, 1, 0)
+        rle = binary_mask_to_rle(mask)
+        run_length_encodings.append(rle)
+
+    return run_length_encodings
+
+
+def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
+    """
+    Binarize the given masks using `object_mask_threshold`, it returns the associated values of `masks`, `scores` and
+    `labels`.
+
+    Args:
+        masks (`torch.Tensor`):
+            A tensor of shape `(num_queries, height, width)`.
+        scores (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        labels (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        object_mask_threshold (`float`):
+            A number between 0 and 1 used to binarize the masks.
+    Raises:
+        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
+    Returns:
+        `tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
+        < `object_mask_threshold`.
+    """
+    if not (masks.shape[0] == scores.shape[0] == labels.shape[0]):
+        raise ValueError("mask, scores and labels must have the same shape!")
+
+    to_keep = labels.ne(num_labels) & (scores > object_mask_threshold)
+
+    return masks[to_keep], scores[to_keep], labels[to_keep]
+
+
+def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8):
+    # Get the mask associated with the k class
+    mask_k = mask_labels == k
+    mask_k_area = mask_k.sum()
+
+    # Compute the area of all the stuff in query k
+    original_area = (mask_probs[k] >= mask_threshold).sum()
+    mask_exists = mask_k_area > 0 and original_area > 0
+
+    # Eliminate disconnected tiny segments
+    if mask_exists:
+        area_ratio = mask_k_area / original_area
+        if not area_ratio.item() > overlap_mask_area_threshold:
+            mask_exists = False
+
+    return mask_exists, mask_k
+
+
+def compute_segments(
+    mask_probs,
+    pred_scores,
+    pred_labels,
+    mask_threshold: float = 0.5,
+    overlap_mask_area_threshold: float = 0.8,
+    label_ids_to_fuse: Optional[set[int]] = None,
+    target_size: Optional[tuple[int, int]] = None,
+):
+    height = mask_probs.shape[1] if target_size is None else target_size[0]
+    width = mask_probs.shape[2] if target_size is None else target_size[1]
+
+    segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device)
+    segments: list[dict] = []
+
+    if target_size is not None:
+        mask_probs = nn.functional.interpolate(
+            mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False
+        )[0]
+
+    current_segment_id = 0
+
+    # Weigh each mask by its prediction score
+    mask_probs *= pred_scores.view(-1, 1, 1)
+    mask_labels = mask_probs.argmax(0)  # [height, width]
+
+    # Keep track of instances of each class
+    stuff_memory_list: dict[str, int] = {}
+    for k in range(pred_labels.shape[0]):
+        pred_class = pred_labels[k].item()
+        should_fuse = pred_class in label_ids_to_fuse
+
+        # Check if mask exists and large enough to be a segment
+        mask_exists, mask_k = check_segment_validity(
+            mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold
+        )
+
+        if mask_exists:
+            if pred_class in stuff_memory_list:
+                current_segment_id = stuff_memory_list[pred_class]
+            else:
+                current_segment_id += 1
+
+            # Add current object segment to final segmentation map
+            segmentation[mask_k] = current_segment_id
+            segment_score = round(pred_scores[k].item(), 6)
+            segments.append(
+                {
+                    "id": current_segment_id,
+                    "label_id": pred_class,
+                    "was_fused": should_fuse,
+                    "score": segment_score,
+                }
+            )
+            if should_fuse:
+                stuff_memory_list[pred_class] = current_segment_id
+
+    return segmentation, segments
+
+
+@requires(backends=("vision",))
+class DetrImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Detr image processor.
+
+    Args:
+        format (`str`, *optional*, defaults to `"coco_detection"`):
+            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
+            overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                    Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                    the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                    less or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                    aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                    `max_width`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            Resampling filter to use if resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to True):
+            Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+            `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+            Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+            channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    def __init__(
+        self,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_annotations: Optional[bool] = None,
+        do_pad: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Store the preprocessing configuration, resolving legacy keyword arguments and defaults first.
+
+        Two legacy keyword arguments are consumed from `kwargs` before the rest is forwarded to the base
+        class: `pad_and_return_pixel_mask` (replaces `do_pad`) and the deprecated `max_size`.
+        """
+        # Legacy alias: when given, it takes precedence over `do_pad`.
+        if "pad_and_return_pixel_mask" in kwargs:
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            # Without an explicit `max_size`, 1333 is used only when the caller supplied `size`.
+            max_size = None if size is None else 1333
+
+        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+        # NOTE(review): presumably normalizes `size` into a size dict, using `max_size` for the legacy
+        # form -- confirm against `get_size_dict`.
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Backwards compatibility
+        # When not set explicitly, annotation conversion follows `do_normalize`.
+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
+        super().__init__(**kwargs)
+        self.format = format
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
+        # Fall back to the ImageNet statistics when no mean/std is given.
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+        self.do_pad = do_pad
+        self.pad_size = pad_size
+        # NOTE(review): looks like the list of argument names accepted by `preprocess` -- confirm where
+        # this attribute is consumed.
+        self._valid_processor_keys = [
+            "images",
+            "annotations",
+            "return_segmentation_masks",
+            "masks_path",
+            "do_resize",
+            "size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "do_convert_annotations",
+            "image_mean",
+            "image_std",
+            "do_pad",
+            "pad_size",
+            "format",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+        created using from_dict and kwargs e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600,
+        max_size=800)`
+        """
+        image_processor_dict = image_processor_dict.copy()
+        if "max_size" in kwargs:
+            image_processor_dict["max_size"] = kwargs.pop("max_size")
+        if "pad_and_return_pixel_mask" in kwargs:
+            image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
+        return super().from_dict(image_processor_dict, **kwargs)
+
+    def prepare_annotation(
+        self,
+        image: np.ndarray,
+        target: dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> dict:
+        """
+        Prepare an annotation for feeding into DETR model.
+        """
+        format = format if format is not None else self.format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
+        elif format == AnnotationFormat.COCO_PANOPTIC:
+            return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
+            )
+        else:
+            raise ValueError(f"Format {format} is not supported.")
+        return target
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+        int, smaller edge of the image will be matched to this number.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge'] instead`.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            max_size = None
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+        if "shortest_edge" in size and "longest_edge" in size:
+            new_size = get_resize_output_image_size(
+                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+            )
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
+        elif "height" in size and "width" in size:
+            new_size = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+            )
+        image = resize(
+            image,
+            size=new_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        return image
+
+    def resize_annotation(
+        self,
+        annotation,
+        orig_size,
+        size,
+        resample: PILImageResampling = PILImageResampling.NEAREST,
+    ) -> dict:
+        """
+        Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
+        to this number.
+        """
+        return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
+
+    # TODO (Amy) - update to use `rescale_factor` instead of `scale`
+    def rescale(
+        self,
+        image: np.ndarray,
+        rescale_factor: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+                one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+
+    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+        """
+        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
+        """
+        return normalize_annotation(annotation, image_size=image_size)
+
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: dict,
+        input_image_size: tuple[int, int],
+        output_image_size: tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> dict:
+        """
+        Update the annotation for a padded image.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = pad(
+                    masks,
+                    padding,
+                    mode=PaddingMode.CONSTANT,
+                    constant_values=0,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                boxes = value
+                boxes *= np.asarray(
+                    [
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                    ]
+                )
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: tuple[int, int],
+        annotation: Optional[dict[str, Any]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+    ) -> np.ndarray:
+        """
+        Pad an image with zeros to the given size.
+        """
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+        output_height, output_width = output_size
+
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        padding = ((0, pad_bottom), (0, pad_right))
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        if annotation is not None:
+            annotation = self._update_annotation_for_padded_image(
+                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+            )
+        return padded_image, annotation
+
+    def pad(
+        self,
+        images: list[np.ndarray],
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+    ) -> BatchFeature:
+        """
+        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
+        in the batch and optionally returns their corresponding pixel mask.
+
+        Args:
+            images (list[`np.ndarray`]):
+                Images to pad.
+            annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+                Annotations to transform according to the padding that is applied to the images.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+                format, the bounding boxes will not be updated.
+            pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
+        """
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        if pad_size is not None:
+            padded_size = (pad_size["height"], pad_size["width"])
+        else:
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)
+
+        annotation_list = annotations if annotations is not None else [None] * len(images)
+        padded_images = []
+        padded_annotations = []
+        for image, annotation in zip(images, annotation_list):
+            padded_image, padded_annotation = self._pad_image(
+                image,
+                padded_size,
+                annotation,
+                constant_values=constant_values,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                update_bboxes=update_bboxes,
+            )
+            padded_images.append(padded_image)
+            padded_annotations.append(padded_annotation)
+
+        data = {"pixel_values": padded_images}
+
+        if return_pixel_mask:
+            masks = [
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
+                for image in images
+            ]
+            data["pixel_mask"] = masks
+
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+            ]
+
+        return encoded_inputs
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample=None,  # PILImageResampling
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[Union[int, float]] = None,
+        do_normalize: Optional[bool] = None,
+        do_convert_annotations: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images so that it can be used by the model.
+
+        The steps run in a fixed order: annotation preparation, resize, rescale, normalize, annotation
+        normalization, then either padding or conversion to the requested channel layout.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+                from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+                List of annotations associated with the image or batch of images. If annotation is for object
+                detection, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+                  dictionary. An image can have no annotations, in which case the list should be empty.
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                  An image can have no segments, in which case the list should be empty.
+                - "file_name" (`str`): The file name of the image.
+            return_segmentation_masks (`bool`, *optional*):
+                Whether to return segmentation masks. If unset, `prepare_annotation` uses `False` for COCO
+                detection annotations and `True` for COCO panoptic annotations.
+            masks_path (`str` or `pathlib.Path`, *optional*):
+                Path to the directory containing the segmentation masks.
+            do_resize (`bool`, *optional*, defaults to self.do_resize):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to self.size):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to self.resample):
+                Resampling filter to use when resizing the image.
+            do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+                Rescale factor to use when rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+                Whether to normalize the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and in relative coordinates.
+            image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean):
+                Mean to use when normalizing the image.
+            image_std (`float` or `list[float]`, *optional*, defaults to self.image_std):
+                Standard deviation to use when normalizing the image.
+            do_pad (`bool`, *optional*, defaults to self.do_pad):
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+                Format of the annotations.
+            return_tensors (`str` or `TensorType`, *optional*):
+                Type of tensors to return. If `None`, will return the list of images.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`dict[str, int]`, *optional*, defaults to self.pad_size):
+                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
+
+        Returns:
+            `BatchFeature`: Contains `"pixel_values"`, plus `"pixel_mask"` when padding is applied and `"labels"`
+            (one `BatchFeature` per image) when annotations were passed.
+
+        Raises:
+            ValueError: If the images are of an unsupported type, if the number of images and annotations differ,
+                or if `masks_path` is given for panoptic annotations but is neither a `str` nor a `pathlib.Path`.
+        """
+        # Deprecated alias: when present it overrides whatever was passed as `do_pad`.
+        if "pad_and_return_pixel_mask" in kwargs:
+            logger.warning_once(
+                "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+                "use `do_pad` instead."
+            )
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        # NOTE(review): the deprecated `max_size` value replaces `size` wholesale here instead of being combined
+        # with it as a longest-edge bound (compare `resize`, which forwards `max_size` to `get_size_dict`).
+        # Confirm this is intended.
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` argument is deprecated and will be removed in a future version, use"
+                " `size['longest_edge']` instead."
+            )
+            size = kwargs.pop("max_size")
+
+        # Every option left as `None` falls back to the value configured on the processor instance.
+        do_resize = self.do_resize if do_resize is None else do_resize
+        size = self.size if size is None else size
+        size = get_size_dict(size=size, default_to_square=False)
+        resample = self.resample if resample is None else resample
+        do_rescale = self.do_rescale if do_rescale is None else do_rescale
+        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = self.do_normalize if do_normalize is None else do_normalize
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+        do_convert_annotations = (
+            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+        )
+        do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
+        format = self.format if format is None else format
+
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        # Whatever is still left in `kwargs` is checked against the processor's list of known keys.
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated.
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # A single annotation dict is treated as a batch of one.
+        if annotations is not None and isinstance(annotations, dict):
+            annotations = [annotations]
+
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        # Coerce a string format into the enum before it is compared or passed on.
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        if (
+            masks_path is not None
+            and format == AnnotationFormat.COCO_PANOPTIC
+            and not isinstance(masks_path, (pathlib.Path, str))
+        ):
+            raise ValueError(
+                "The path to the directory containing the mask PNG files should be provided as a"
+                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+            )
+
+        # All transformations expect numpy arrays
+        images = [to_numpy_array(image) for image in images]
+
+        # Only the first image is inspected for this warning.
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+        if annotations is not None:
+            prepared_images = []
+            prepared_annotations = []
+            for image, target in zip(images, annotations):
+                target = self.prepare_annotation(
+                    image,
+                    target,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=input_data_format,
+                )
+                prepared_images.append(image)
+                prepared_annotations.append(target)
+            images = prepared_images
+            annotations = prepared_annotations
+            del prepared_images, prepared_annotations
+
+        # transformations
+        if do_resize:
+            if annotations is not None:
+                resized_images, resized_annotations = [], []
+                for image, target in zip(images, annotations):
+                    # The annotation is resized from the image's size before resizing to its size afterwards.
+                    orig_size = get_image_size(image, input_data_format)
+                    resized_image = self.resize(
+                        image, size=size, resample=resample, input_data_format=input_data_format
+                    )
+                    resized_annotation = self.resize_annotation(
+                        target, orig_size, get_image_size(resized_image, input_data_format)
+                    )
+                    resized_images.append(resized_image)
+                    resized_annotations.append(resized_annotation)
+                images = resized_images
+                annotations = resized_annotations
+                del resized_images, resized_annotations
+            else:
+                images = [
+                    self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+
+        if do_rescale:
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
+
+        # Annotation normalization uses each image's current size, i.e. after any resize above.
+        if do_convert_annotations and annotations is not None:
+            annotations = [
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+                for annotation, image in zip(annotations, images)
+            ]
+
+        if do_pad:
+            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+            encoded_inputs = self.pad(
+                images,
+                annotations=annotations,
+                return_pixel_mask=True,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                update_bboxes=do_convert_annotations,
+                return_tensors=return_tensors,
+                pad_size=pad_size,
+            )
+        else:
+            # Without padding, the images are only converted to the requested channel layout.
+            images = [
+                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                for image in images
+            ]
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+            if annotations is not None:
+                encoded_inputs["labels"] = [
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+                ]
+
+        return encoded_inputs
+
+    # POSTPROCESSING METHODS - TODO: add support for other frameworks
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
+    def post_process(self, outputs, target_sizes):
+        """
+        Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the
+                original image size (before any data augmentation). For visualization, this should be the image size
+                after data augment, but before padding.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        logger.warning_once(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+        )
+
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+
+        if len(out_logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        prob = nn.functional.softmax(out_logits, -1)
+        scores, labels = prob[..., :-1].max(-1)
+
+        # convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(out_bbox)
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+        return results
+
+    def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
+        """
+        Deprecated helper that turns the output of [`DetrForSegmentation`] into thresholded per-query masks. Only
+        supports PyTorch.
+
+        For every image, queries whose best class is the last class index, or whose best score does not exceed
+        `threshold`, are dropped. The remaining mask logits are resized bilinearly to the requested size, passed
+        through a sigmoid and binarized with `mask_threshold`.
+
+        Args:
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model; `outputs.logits` and `outputs.pred_masks` are read.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`):
+                Requested final size (h, w) of each prediction. Entries that are not tuples must provide `.tolist()`.
+            threshold (`float`, *optional*, defaults to 0.9):
+                Score a query must exceed to be kept.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Probability a mask pixel must exceed to be set to 1.
+        Returns:
+            `list[Dict]`: One dictionary per image with the keys `"scores"`, `"labels"` and `"masks"`.
+        """
+        logger.warning_once(
+            "`post_process_segmentation` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_semantic_segmentation`.",
+        )
+        class_logits = outputs.logits
+        mask_logits = outputs.pred_masks
+        # The last class index is the one that gets filtered out as "empty".
+        no_object_id = class_logits.shape[-1] - 1
+
+        per_image = []
+        for logits_i, masks_i, size_i in zip(class_logits, mask_logits, target_sizes):
+            # Best class and its probability for every query of this image.
+            top_scores, top_labels = logits_i.softmax(-1).max(-1)
+            selected = top_labels.ne(no_object_id) & (top_scores > threshold)
+
+            kept_masks = masks_i[selected]
+            # `interpolate` needs a plain tuple; tensors / other containers are converted through `.tolist()`.
+            out_size = size_i if isinstance(size_i, tuple) else tuple(size_i.tolist())
+            resized = nn.functional.interpolate(kept_masks[:, None], out_size, mode="bilinear").squeeze(1)
+
+            per_image.append(
+                {
+                    "scores": top_scores[selected],
+                    "labels": top_labels[selected],
+                    # Multiplying by 1 turns the boolean mask into an integer one.
+                    "masks": (resized.sigmoid() > mask_threshold) * 1,
+                }
+            )
+        return per_image
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
+    def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
+        """
+        Deprecated helper that adds a `"masks"` entry to already computed detection results, using the masks
+        predicted by [`DetrForSegmentation`]. Only supports PyTorch.
+
+        The predicted masks are upsampled bilinearly to the largest (h, w) found in `max_target_sizes`, binarized,
+        moved to CPU, cropped per image to that image's entry in `max_target_sizes` and finally resized with
+        nearest-neighbour interpolation to the matching entry of `orig_target_sizes`.
+
+        Args:
+            results (`list[Dict]`):
+                Results list obtained by [`~DetrImageProcessor.post_process`]; it is modified in place and returned.
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model; only `outputs.pred_masks` is read.
+            orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Size (h, w) each image's masks are resized to at the end.
+            max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Size (h, w) used to crop each image's masks before the final resize.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+        Returns:
+            `list[Dict]`: The `results` list, each entry extended with a `"masks"` byte tensor.
+        Raises:
+            ValueError: If `orig_target_sizes` and `max_target_sizes` have different lengths.
+        """
+        logger.warning_once(
+            "`post_process_instance` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_instance_segmentation`.",
+        )
+
+        if len(orig_target_sizes) != len(max_target_sizes):
+            raise ValueError("Make sure to pass in as many orig_target_sizes as max_target_sizes")
+
+        # Largest height and largest width over the batch define the common upsampling size.
+        canvas_h, canvas_w = max_target_sizes.max(0)[0].tolist()
+        upsampled = nn.functional.interpolate(
+            outputs.pred_masks.squeeze(2), size=(canvas_h, canvas_w), mode="bilinear", align_corners=False
+        )
+        binary_masks = (upsampled.sigmoid() > threshold).cpu()
+
+        for idx, (image_masks, crop_size, final_size) in enumerate(
+            zip(binary_masks, max_target_sizes, orig_target_sizes)
+        ):
+            crop_h, crop_w = crop_size[0], crop_size[1]
+            # Keep only the top-left (crop_h, crop_w) region and add a channel axis for `interpolate`.
+            results[idx]["masks"] = image_masks[:, :crop_h, :crop_w].unsqueeze(1)
+            cropped = results[idx]["masks"].float()
+            results[idx]["masks"] = nn.functional.interpolate(
+                cropped, size=tuple(final_size.tolist()), mode="nearest"
+            ).byte()
+
+        return results
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
+    def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
+        """
+        Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrSegmentationOutput`]):
+                Raw outputs of the model.
+            processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`):
+                Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
+                augmentation but before batching.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `list[Tuple]` of length `batch_size`, *optional*):
+                Torch Tensor (or list) corresponding to the requested final size `(height, width)` of each prediction.
+                If left to None, it will default to the `processed_sizes`.
+            is_thing_map (`dict[int, bool]`, *optional*):
+                Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
+                If not set, defaults to the `is_thing_map` of COCO panoptic (indices 0..90 are things, 91..200 are
+                stuff).
+            threshold (`float`, *optional*, defaults to 0.85):
+                Threshold to use to filter out queries.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values for
+            an image in the batch as predicted by the model.
+        Raises:
+            ValueError: If `processed_sizes`, `target_sizes`, the logits and the masks do not all describe the same
+                number of images, or if the number of kept boxes differs from the number of kept labels.
+        """
+        logger.warning_once(
+            "`post_process_panoptic` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_panoptic_segmentation`.",
+        )
+        if target_sizes is None:
+            target_sizes = processed_sizes
+        if len(processed_sizes) != len(target_sizes):
+            raise ValueError("Make sure to pass in as many processed_sizes as target_sizes")
+
+        if is_thing_map is None:
+            # default to is_thing_map of COCO panoptic
+            is_thing_map = {i: i <= 90 for i in range(201)}
+
+        out_logits, raw_masks, raw_boxes = outputs.logits, outputs.pred_masks, outputs.pred_boxes
+        if not len(out_logits) == len(raw_masks) == len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many target sizes as the batch dimension of the logits and masks"
+            )
+        # The last class index is the one that gets filtered out as "empty".
+        empty_label = out_logits.shape[-1] - 1
+        preds = []
+
+        def to_tuple(tup):
+            # Sizes may come as tuples or as objects exposing `.tolist()` (e.g. tensors).
+            if isinstance(tup, tuple):
+                return tup
+            return tuple(tup.tolist())
+
+        for cur_logits, cur_masks, cur_boxes, size, target_size in zip(
+            out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes
+        ):
+            # we filter empty queries and detection below threshold
+            cur_scores, cur_labels = cur_logits.softmax(-1).max(-1)
+            keep = cur_labels.ne(empty_label) & (cur_scores > threshold)
+            cur_scores = cur_scores[keep]
+            cur_labels = cur_labels[keep]
+            cur_masks = cur_masks[keep]
+            cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1)
+            cur_boxes = center_to_corners_format(cur_boxes[keep])
+
+            h, w = cur_masks.shape[-2:]
+            if len(cur_boxes) != len(cur_labels):
+                raise ValueError("Not as many boxes as there are classes")
+
+            # It may be that we have several predicted masks for the same stuff class.
+            # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+            cur_masks = cur_masks.flatten(1)
+            stuff_equiv_classes = defaultdict(list)
+            for k, label in enumerate(cur_labels):
+                if not is_thing_map[label.item()]:
+                    stuff_equiv_classes[label.item()].append(k)
+
+            def get_ids_area(masks, scores, dedup=False):
+                # This helper function creates the final panoptic segmentation image
+                # It also returns the area of the masks that appears on the image
+
+                # Per pixel, softmax over the kept masks; the winning mask index becomes the segment id.
+                m_id = masks.transpose(0, 1).softmax(-1)
+
+                if m_id.shape[-1] == 0:
+                    # We didn't detect any mask :(
+                    m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device)
+                else:
+                    m_id = m_id.argmax(-1).view(h, w)
+
+                if dedup:
+                    # Merge the masks corresponding to the same stuff class
+                    for equiv in stuff_equiv_classes.values():
+                        if len(equiv) > 1:
+                            for eq_id in equiv:
+                                m_id.masked_fill_(m_id.eq(eq_id), equiv[0])
+
+                final_h, final_w = to_tuple(target_size)
+
+                # Round-trip through a PIL image so the id map can be resized with nearest-neighbour resampling.
+                seg_img = PIL.Image.fromarray(id_to_rgb(m_id.view(h, w).cpu().numpy()))
+                seg_img = seg_img.resize(size=(final_w, final_h), resample=PILImageResampling.NEAREST)
+
+                np_seg_img = torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes()))
+                np_seg_img = np_seg_img.view(final_h, final_w, 3)
+                np_seg_img = np_seg_img.numpy()
+
+                m_id = torch.from_numpy(rgb_to_id(np_seg_img))
+
+                # Pixel count of every segment id in the resized map.
+                area = []
+                for i in range(len(scores)):
+                    area.append(m_id.eq(i).sum().item())
+                return area, seg_img
+
+            area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True)
+            if cur_labels.numel() > 0:
+                # We now filter empty masks as long as we find some
+                while True:
+                    # Segments covering at most 4 pixels are dropped, then the id map is recomputed.
+                    filtered_small = torch.as_tensor(
+                        [area[i] <= 4 for i in range(len(cur_labels))], dtype=torch.bool, device=keep.device
+                    )
+                    if filtered_small.any().item():
+                        cur_scores = cur_scores[~filtered_small]
+                        cur_labels = cur_labels[~filtered_small]
+                        cur_masks = cur_masks[~filtered_small]
+                        area, seg_img = get_ids_area(cur_masks, cur_scores)
+                    else:
+                        break
+
+            else:
+                cur_labels = torch.ones(1, dtype=torch.long, device=cur_labels.device)
+
+            segments_info = []
+            for i, a in enumerate(area):
+                cat = cur_labels[i].item()
+                segments_info.append({"id": i, "isthing": is_thing_map[cat], "category_id": cat, "area": a})
+            del cur_labels
+
+            # Serialize the segmentation image to PNG bytes held in memory.
+            with io.BytesIO() as out:
+                seg_img.save(out, format="PNG")
+                predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+            preds.append(predictions)
+        return preds
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
+    def post_process_object_detection(
+        self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, list[tuple]] = None
+    ):
+        """
+        Turns the raw output of [`DetrObjectDetectionOutput`]-style predictions into scored, labelled boxes in
+        (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model; `outputs.logits` and `outputs.pred_boxes` are read.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Only predictions whose score is strictly greater than this value are returned.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, boxes are left unscaled.
+        Returns:
+            `list[Dict]`: One dictionary per image with the keys `"scores"`, `"labels"` and `"boxes"`.
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
+        """
+        logits, raw_boxes = outputs.logits, outputs.pred_boxes
+
+        if target_sizes is not None and len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+
+        # Softmax over all classes, then ignore the last class column when picking the best label.
+        class_probs = nn.functional.softmax(logits, -1)
+        scores, labels = class_probs[..., :-1].max(-1)
+
+        # Convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(raw_boxes)
+
+        # Multiply x coordinates by the width and y coordinates by the height of each image
+        if target_sizes is not None:
+            if isinstance(target_sizes, list):
+                heights = torch.Tensor([size[0] for size in target_sizes])
+                widths = torch.Tensor([size[1] for size in target_sizes])
+            else:
+                heights, widths = target_sizes.unbind(1)
+
+            scale = torch.stack([widths, heights, widths, heights], dim=1).to(boxes.device)
+            boxes = boxes * scale[:, None, :]
+
+        detections = []
+        for image_scores, image_labels, image_boxes in zip(scores, labels, boxes):
+            confident = image_scores > threshold
+            detections.append(
+                {
+                    "scores": image_scores[confident],
+                    "labels": image_labels[confident],
+                    "boxes": image_boxes[confident],
+                }
+            )
+
+        return detections
+
+    def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple[int, int]]] = None):
+        """
+        Builds one semantic segmentation map per image from the output of [`DetrForSegmentation`]. Only supports
+        PyTorch.
+
+        Class probabilities (without the last class) and mask probabilities are combined over the query axis into
+        per-class score maps; the map is the per-pixel argmax over classes.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                Raw outputs of the model; `outputs.logits` and `outputs.pred_masks` are read.
+            target_sizes (`list[tuple[int, int]]`, *optional*):
+                A list of tuples (`tuple[int, int]`) containing the target size (height, width) of each image in the
+                batch. If unset, predictions will not be resized.
+        Returns:
+            `list[torch.Tensor]`:
+                A list of length `batch_size`, where each item is a semantic segmentation map of shape (height, width)
+                corresponding to the target_sizes entry (if `target_sizes` is specified). Each entry of each
+                `torch.Tensor` corresponds to a semantic class id.
+        Raises:
+            ValueError: If `target_sizes` is given and its length differs from the batch dimension of the logits.
+        """
+        query_class_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        query_mask_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        # `[..., :-1]` drops the null class before mixing class and mask probabilities.
+        class_probs = query_class_logits.softmax(dim=-1)[..., :-1]
+        pixel_probs = query_mask_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Per-class score maps of shape (batch_size, num_classes, height, width)
+        class_maps = torch.einsum("bqc, bqhw -> bchw", class_probs, pixel_probs)
+        num_images = query_class_logits.shape[0]
+
+        if target_sizes is None:
+            # No resizing requested: argmax over classes, then split the batch into a list.
+            return list(class_maps.argmax(dim=1).unbind(0))
+
+        if num_images != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+
+        # Resize each image's score maps to its own target size before taking the argmax.
+        return [
+            nn.functional.interpolate(
+                class_maps[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
+            )[0].argmax(dim=0)
+            for idx in range(num_images)
+        ]
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
+    def post_process_instance_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        target_sizes: Optional[list[tuple[int, int]]] = None,
+        return_coco_annotation: Optional[bool] = False,
+    ) -> list[dict]:
+        """
+        Produces instance segmentation predictions from the output of [`DetrForSegmentation`]. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                Raw outputs of the model; `outputs.logits` and `outputs.pred_masks` are read.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            target_sizes (`list[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction. If unset, predictions will not be resized.
+            return_coco_annotation (`bool`, *optional*, defaults to `False`):
+                If set to `True`, segmentation maps are returned in COCO run-length encoding (RLE) format.
+        Returns:
+            `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- The segmentation map produced for the image, or its run-length encoding (RLE) if
+              `return_coco_annotation` is set to `True`. When no query survives filtering, this is a
+              `(height, width)` tensor filled with -1 (never RLE-encoded).
+            - **segments_info** -- The per-segment information produced alongside the map; an empty list when no
+              query survives filtering.
+        """
+        query_class_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        query_mask_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        num_images = query_class_logits.shape[0]
+        num_labels = query_class_logits.shape[-1] - 1
+
+        all_mask_probs = query_mask_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Best label and its probability for every query (batch_size, num_queries)
+        all_scores, all_labels = nn.functional.softmax(query_class_logits, dim=-1).max(-1)
+
+        per_image = []
+        for idx in range(num_images):
+            kept_masks, kept_scores, kept_labels = remove_low_and_no_objects(
+                all_mask_probs[idx], all_scores[idx], all_labels[idx], threshold, num_labels
+            )
+
+            if kept_masks.shape[0] <= 0:
+                # Nothing left after filtering: emit a map of -1 at the requested (or native) resolution.
+                height, width = target_sizes[idx] if target_sizes is not None else kept_masks.shape[1:]
+                empty_map = torch.zeros((height, width)) - 1
+                per_image.append({"segmentation": empty_map, "segments_info": []})
+                continue
+
+            requested_size = None if target_sizes is None else target_sizes[idx]
+            # An empty `label_ids_to_fuse` list is passed, i.e. no label is marked for fusing here.
+            segmentation, segments = compute_segments(
+                mask_probs=kept_masks,
+                pred_scores=kept_scores,
+                pred_labels=kept_labels,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=[],
+                target_size=requested_size,
+            )
+
+            if return_coco_annotation:
+                # Swap the dense map for its run-length encoding.
+                segmentation = convert_segmentation_to_rle(segmentation)
+
+            per_image.append({"segmentation": segmentation, "segments_info": segments})
+        return per_image
+
+    # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
+    def post_process_panoptic_segmentation(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        mask_threshold: float = 0.5,
+        overlap_mask_area_threshold: float = 0.8,
+        label_ids_to_fuse: Optional[set[int]] = None,
+        target_sizes: Optional[list[tuple[int, int]]] = None,
+    ) -> list[dict]:
+        """
+        Produces image panoptic segmentation predictions from the output of [`DetrForSegmentation`]. Only supports
+        PyTorch.
+
+        Args:
+            outputs ([`DetrForSegmentation`]):
+                The outputs from [`DetrForSegmentation`]; `outputs.logits` and `outputs.pred_masks` are read.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The probability score threshold to keep predicted instance masks.
+            mask_threshold (`float`, *optional*, defaults to 0.5):
+                Threshold to use when turning the predicted masks into binary values.
+            overlap_mask_area_threshold (`float`, *optional*, defaults to 0.8):
+                The overlap mask area threshold to merge or discard small disconnected parts within each binary
+                instance mask.
+            label_ids_to_fuse (`Set[int]`, *optional*):
+                The labels in this set will have all their instances be fused together. For instance we could say
+                there can only be one sky in an image, but several persons, so the label ID for sky would be in that
+                set, but not the one for person. When left unset, a warning is logged once and an empty set is used.
+            target_sizes (`list[Tuple]`, *optional*):
+                List of length (batch_size), where each list item (`tuple[int, int]]`) corresponds to the requested
+                final size (height, width) of each prediction in batch. If unset, predictions will not be resized.
+        Returns:
+            `list[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
+            - **segmentation** -- The segmentation map produced for the image. When no query survives filtering,
+              this is a `(height, width)` tensor filled with -1.
+            - **segments_info** -- The per-segment information produced alongside the map; an empty list when no
+              query survives filtering.
+        """
+
+        if label_ids_to_fuse is None:
+            logger.warning_once("`label_ids_to_fuse` unset. No instance will be fused.")
+            label_ids_to_fuse = set()
+
+        query_class_logits = outputs.logits  # [batch_size, num_queries, num_classes+1]
+        query_mask_logits = outputs.pred_masks  # [batch_size, num_queries, height, width]
+
+        num_images = query_class_logits.shape[0]
+        num_labels = query_class_logits.shape[-1] - 1
+
+        all_mask_probs = query_mask_logits.sigmoid()  # [batch_size, num_queries, height, width]
+
+        # Best label and its probability for every query (batch_size, num_queries)
+        all_scores, all_labels = nn.functional.softmax(query_class_logits, dim=-1).max(-1)
+
+        per_image = []
+        for idx in range(num_images):
+            kept_masks, kept_scores, kept_labels = remove_low_and_no_objects(
+                all_mask_probs[idx], all_scores[idx], all_labels[idx], threshold, num_labels
+            )
+
+            if kept_masks.shape[0] <= 0:
+                # Nothing left after filtering: emit a map of -1 at the requested (or native) resolution.
+                height, width = target_sizes[idx] if target_sizes is not None else kept_masks.shape[1:]
+                empty_map = torch.zeros((height, width)) - 1
+                per_image.append({"segmentation": empty_map, "segments_info": []})
+                continue
+
+            requested_size = None if target_sizes is None else target_sizes[idx]
+            segmentation, segments = compute_segments(
+                mask_probs=kept_masks,
+                pred_scores=kept_scores,
+                pred_labels=kept_labels,
+                mask_threshold=mask_threshold,
+                overlap_mask_area_threshold=overlap_mask_area_threshold,
+                label_ids_to_fuse=label_ids_to_fuse,
+                target_size=requested_size,
+            )
+
+            per_image.append({"segmentation": segmentation, "segments_info": segments})
+        return per_image
+
+
+# Names exported by a star-import of this module.
+__all__ = ["DetrImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..86835ca62cfc02f7e2f8f134c9824c3eda1f31e5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/modeling_detr.py
@@ -0,0 +1,1693 @@
+# coding=utf-8
+# Copyright 2021 Facebook AI Research The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DETR model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    is_timm_available,
+    logging,
+    requires_backends,
+)
+from ...utils.backbone_utils import load_backbone
+from .configuration_detr import DetrConfig
+
+
+if is_timm_available():
+    from timm import create_model
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the DETR decoder. This class adds one attribute to BaseModelOutputWithCrossAttentions,
+    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
+    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
+    """
+)
+class DetrDecoderOutput(BaseModelOutputWithCrossAttentions):
+    r"""
+    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`. Attention weights of the decoder's cross-attention layer, after the attention softmax,
+        used to compute the weighted average in the cross-attention heads.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, num_queries, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+        Intermediate decoder activations, i.e. the output of each decoder layer, each of them passed through a
+        layernorm.
+    """
+
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None  # the one field added to the base output class
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the DETR encoder-decoder model. This class adds one attribute to Seq2SeqModelOutput,
+    namely an optional stack of intermediate decoder activations, i.e. the output of each decoder layer, each of them
+    gone through a layernorm. This is useful when training the model with auxiliary decoding losses.
+    """
+)
+class DetrModelOutput(Seq2SeqModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(config.decoder_layers, batch_size, sequence_length, hidden_size)`, *optional*, returned when `config.auxiliary_loss=True`):
+        Intermediate decoder activations, i.e. the output of each decoder layer, each of them passed through a
+        layernorm.
+    """
+
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None  # the one field added to Seq2SeqModelOutput
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`DetrForObjectDetection`].
+    """
+)
+class DetrObjectDetectionOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
+        unnormalized bounding boxes.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None  # NOTE(review): this field and the ones below are not described in the docstring above
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`DetrForSegmentation`].
+    """
+)
+class DetrSegmentationOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~DetrImageProcessor.post_process_object_detection`] to retrieve the
+        unnormalized bounding boxes.
+    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height/4, width/4)`):
+        Segmentation mask logits for all queries. See also
+        [`~DetrImageProcessor.post_process_semantic_segmentation`],
+        [`~DetrImageProcessor.post_process_instance_segmentation`] or
+        [`~DetrImageProcessor.post_process_panoptic_segmentation`] to evaluate semantic, instance and panoptic
+        segmentation masks respectively.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    pred_masks: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None  # NOTE(review): this field and the ones below are not described in the docstring above
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+# BELOW: utilities copied from
+# https://github.com/facebookresearch/detr/blob/master/backbone.py
+class DetrFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d whose batch statistics and affine parameters are fixed (all four are registered as buffers).
+
+    Adapted from torchvision.misc.ops, with an eps added before the rsqrt; without it, models other than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        # initial values: unit scale and variance, zero shift and mean
+        buffer_factories = (
+            ("weight", torch.ones),
+            ("bias", torch.zeros),
+            ("running_mean", torch.zeros),
+            ("running_var", torch.ones),
+        )
+        for buffer_name, factory in buffer_factories:
+            self.register_buffer(buffer_name, factory(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        # drop `num_batches_tracked` if present: this module registers no buffer of that name
+        state_dict.pop(prefix + "num_batches_tracked", None)
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        def per_channel(buffer):
+            # reshape to (1, C, 1, 1) so the buffer lines up with the second axis of `x`
+            return buffer.reshape(1, -1, 1, 1)
+
+        scale = per_channel(self.weight) * (per_channel(self.running_var) + 1e-5).rsqrt()
+        shift = per_channel(self.bias) - per_channel(self.running_mean) * scale
+        return x * scale + shift
+
+
+def replace_batch_norm(model):
+    r"""
+    Swap every `torch.nn.BatchNorm2d` found in `model` (at any depth) for a `DetrFrozenBatchNorm2d`, in place.
+
+    Args:
+        model (torch.nn.Module):
+            module whose children are scanned recursively
+    """
+    meta_device = torch.device("meta")
+    copied_buffers = ("weight", "bias", "running_mean", "running_var")
+
+    for child_name, child in model.named_children():
+        if isinstance(child, nn.BatchNorm2d):
+            frozen = DetrFrozenBatchNorm2d(child.num_features)
+            # values are only copied when the source weights do not live on the meta device
+            if child.weight.device != meta_device:
+                for buffer_name in copied_buffers:
+                    getattr(frozen, buffer_name).data.copy_(getattr(child, buffer_name))
+            model._modules[child_name] = frozen
+
+        if next(child.children(), None) is not None:
+            replace_batch_norm(child)
+
+
+class DetrConvEncoder(nn.Module):
+    """
+    Convolutional backbone, using either the AutoBackbone API or one from the timm library.
+
+    nn.BatchNorm2d layers are replaced by DetrFrozenBatchNorm2d as defined above. For backbones whose type name
+    contains "resnet", parameters outside the last three stages are additionally frozen.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+
+        # For backwards compatibility we have to use the timm library directly instead of the AutoBackbone API
+        if config.use_timm_backbone:
+            # We default to values which were previously hard-coded. This enables configurability from the config
+            # using backbone arguments, while keeping the default behavior the same.
+            requires_backends(self, ["timm"])
+            kwargs = getattr(config, "backbone_kwargs", {})
+            kwargs = {} if kwargs is None else kwargs.copy()  # copy so the pops below leave the config's dict intact
+            out_indices = kwargs.pop("out_indices", (1, 2, 3, 4))
+            num_channels = kwargs.pop("in_chans", config.num_channels)
+            if config.dilation:
+                kwargs["output_stride"] = kwargs.get("output_stride", 16)  # keep a provided stride, else use 16
+            backbone = create_model(
+                config.backbone,
+                pretrained=config.use_pretrained_backbone,
+                features_only=True,
+                out_indices=out_indices,
+                in_chans=num_channels,
+                **kwargs,
+            )
+        else:
+            backbone = load_backbone(config)
+
+        # replace batch norm by frozen batch norm
+        with torch.no_grad():
+            replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = (
+            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
+        )
+        # identify the backbone by `config.backbone` or, failing that, by `config.backbone_config.model_type`
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+        # resnet only: freeze parameters whose name mentions none of layer2/3/4 (timm) or stage.1/2/3 (otherwise)
+        if "resnet" in backbone_model_type:
+            for name, parameter in self.model.named_parameters():
+                if config.use_timm_backbone:
+                    if "layer2" not in name and "layer3" not in name and "layer4" not in name:
+                        parameter.requires_grad_(False)
+                else:
+                    if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+                        parameter.requires_grad_(False)
+
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+        """Send `pixel_values` through the backbone and return one `(feature_map, mask)` tuple per feature map."""
+        features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
+        # pair every feature map with a boolean mask resized to its last two dimensions
+        out = []
+        for feature_map in features:
+            # downsample pixel_mask to match shape of corresponding feature_map
+            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            out.append((feature_map, mask))
+        return out
+
+
+class DetrConvModel(nn.Module):
+    """
+    Wraps the convolutional encoder and computes a 2D position embedding for each feature map it produces.
+    """
+
+    def __init__(self, conv_encoder, position_embedding):
+        super().__init__()
+        self.conv_encoder = conv_encoder
+        self.position_embedding = position_embedding
+
+    def forward(self, pixel_values, pixel_mask):
+        # the backbone yields one (feature_map, mask) tuple per intermediate feature map
+        features_and_masks = self.conv_encoder(pixel_values, pixel_mask)
+        # one position encoding per feature map, cast to that feature map's dtype
+        position_encodings = [
+            self.position_embedding(feature_map, mask).to(feature_map.dtype)
+            for feature_map, mask in features_and_masks
+        ]
+        return features_and_masks, position_encodings
+
+
+class DetrSinePositionEmbedding(nn.Module):
+    """
+    Fixed sine/cosine position embedding, very similar to the one of the Attention is all you need paper but
+    generalized to images: row and column positions are encoded separately and concatenated.
+    """
+
+    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        # an explicit scale is only accepted together with normalization
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        self.embedding_dim = embedding_dim
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = 2 * math.pi if scale is None else scale
+
+    def forward(self, pixel_values, pixel_mask):
+        if pixel_mask is None:
+            raise ValueError("No pixel mask provided")
+        # cumulative sums of the mask along dims 1 and 2 serve as the row and column coordinates
+        row_position = pixel_mask.cumsum(1, dtype=torch.float32)
+        column_position = pixel_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            row_position = row_position / (row_position[:, -1:, :] + 1e-6) * self.scale
+            column_position = column_position / (column_position[:, :, -1:] + 1e-6) * self.scale
+
+        channel = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
+        frequency = self.temperature ** (2 * torch.div(channel, 2, rounding_mode="floor") / self.embedding_dim)
+
+        def encode(position):
+            # sine of the even channels and cosine of the odd channels, merged back into one channel axis
+            angle = position[:, :, :, None] / frequency
+            return torch.stack((angle[:, :, :, 0::2].sin(), angle[:, :, :, 1::2].cos()), dim=4).flatten(3)
+
+        return torch.cat((encode(row_position), encode(column_position)), dim=3).permute(0, 3, 1, 2)
+
+
+class DetrLearnedPositionEmbedding(nn.Module):
+    """
+    Learned position embedding: one embedding table for rows and one for columns, each with 50 entries.
+    """
+
+    def __init__(self, embedding_dim=256):
+        super().__init__()
+        self.row_embeddings = nn.Embedding(50, embedding_dim)
+        self.column_embeddings = nn.Embedding(50, embedding_dim)
+
+    def forward(self, pixel_values, pixel_mask=None):
+        batch_size = pixel_values.shape[0]
+        height, width = pixel_values.shape[-2:]
+        device = pixel_values.device
+        column_embedding = self.column_embeddings(torch.arange(width, device=device))
+        row_embedding = self.row_embeddings(torch.arange(height, device=device))
+        # tile both to (height, width, embedding_dim) and concatenate: column half first, then row half
+        tiled_columns = column_embedding.unsqueeze(0).repeat(height, 1, 1)
+        tiled_rows = row_embedding.unsqueeze(1).repeat(1, width, 1)
+        grid = torch.cat([tiled_columns, tiled_rows], dim=-1).permute(2, 0, 1)
+        return grid.unsqueeze(0).repeat(batch_size, 1, 1, 1)
+
+
+def build_position_encoding(config):
+    """Instantiate the position embedding selected by `config.position_embedding_type` ("sine" or "learned")."""
+    half_dim = config.d_model // 2  # both embedding classes concatenate two halves of this size
+    kind = config.position_embedding_type
+    if kind == "sine":
+        # TODO find a better way of exposing other arguments
+        return DetrSinePositionEmbedding(half_dim, normalize=True)
+    if kind == "learned":
+        return DetrLearnedPositionEmbedding(half_dim)
+    # any other value is rejected
+    raise ValueError(f"Not supported {config.position_embedding_type}")
+
+
+class DetrAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Here, we add position embeddings to the queries and keys (as explained in the DETR paper); the values are
+    always projected from the inputs without position embeddings.
+    """
+
+    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0, bias: bool = True):
+        """
+        `embed_dim` is the input/output feature size and must be divisible by `num_heads` (a `ValueError` is
+        raised otherwise); `dropout` is applied to the attention probabilities; `bias` toggles the bias of the
+        four linear projections.
+        """
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+        """Split channels into heads: `(batch, seq_len, embed_dim)` -> `(batch, num_heads, seq_len, head_dim)`."""
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def with_pos_embed(self, tensor: torch.Tensor, object_queries: Optional[Tensor]):
+        """Return `tensor + object_queries`, or `tensor` unchanged when `object_queries` is None."""
+        return tensor if object_queries is None else tensor + object_queries
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
+        key_value_states: Optional[torch.Tensor] = None,
+        spatial_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Input shape: Batch x Time x Channel
+
+        `object_queries` are position embeddings added to `hidden_states` before the query projection (and the
+        key projection in self-attention); `spatial_position_embeddings` are added to `key_value_states` before
+        the key projection. Either may be None. Passing `key_value_states` turns the layer into cross-attention.
+        `attention_mask`, if given, must have shape `(batch_size, 1, target_len, source_len)`; a boolean mask is
+        converted to an additive one with `-inf` at `True` positions.
+
+        Returns `(attn_output, attn_weights)`; `attn_weights` has shape `(batch_size, num_heads, target_len,
+        source_len)` and is None unless `output_attentions` is True.
+        """
+        is_cross_attention = key_value_states is not None
+        batch_size, target_len, embed_dim = hidden_states.size()
+
+        # values are projected from the position-free inputs, so keep references before positions are added;
+        # binding them unconditionally avoids an UnboundLocalError when no position embeddings are passed
+        hidden_states_original = hidden_states
+        key_value_states_original = key_value_states
+        # add position embeddings to the hidden states before projecting to queries and keys (no-op when None)
+        hidden_states = self.with_pos_embed(hidden_states, object_queries)
+        # add key-value position embeddings to the key value states
+        if spatial_position_embeddings is not None:
+            key_value_states = self.with_pos_embed(key_value_states, spatial_position_embeddings)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        if is_cross_attention:
+            key_states = self._shape(self.k_proj(key_value_states), -1, batch_size)
+            value_states = self._shape(self.v_proj(key_value_states_original), -1, batch_size)
+        else:
+            key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
+            value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+        source_len = key_states.size(1)
+
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, target_len, source_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+                    f" {attention_mask.size()}"
+                )
+            if attention_mask.dtype == torch.bool:
+                attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_(
+                    attention_mask, -torch.inf
+                )
+            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # reshape twice and reuse the result below so that the returned attn_weights keeps its gradient
+            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+            attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.bmm(attn_probs, value_states)
+        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size * self.num_heads, target_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+class DetrEncoderLayer(nn.Module):  # self-attention block followed by a feed-forward block, each with residual + layer norm
+    def __init__(self, config: DetrConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = DetrAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        object_queries: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, target_len, source_len)`
+                where padding elements are indicated by very large negative values.
+            object_queries (`torch.FloatTensor`, *optional*):
+                Embeddings added to the hidden states for the queries and keys of self-attention (not the values).
+            output_attentions (`bool`, *optional*):
+                Whether or not to also return the self-attention weights.
+        Returns:
+            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is True.
+        """
+        residual = hidden_states
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            object_queries=object_queries,
+            output_attentions=output_attentions,
+        )
+        # dropout, residual connection, then layer norm
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # feed-forward block: fc1 -> activation -> dropout -> fc2 -> dropout
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        # training only: clamp to just below the dtype's max if any inf/nan shows up
+        if self.training:
+            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)  # NOTE(review): NaN entries presumably stay NaN after clamp — confirm
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class DetrDecoderLayer(GradientCheckpointingLayer):  # self-attention, optional cross-attention, then feed-forward
+    def __init__(self, config: DetrConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = DetrAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = DetrAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        object_queries: Optional[torch.Tensor] = None,
+        query_position_embeddings: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            object_queries (`torch.FloatTensor`, *optional*):
+                position embeddings that are added to `encoder_hidden_states` to form the keys of the
+                cross-attention layer.
+            query_position_embeddings (`torch.FloatTensor`, *optional*):
+                position embeddings that are added to the hidden states to form the queries and keys of the
+                self-attention layer and the queries of the cross-attention layer.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`; the cross-attention
+                block is skipped when this is None.
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to also return the self-attention and cross-attention weights.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            object_queries=query_position_embeddings,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            # queries come from the decoder states, keys/values from the encoder states
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                object_queries=query_position_embeddings,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                spatial_position_embeddings=object_queries,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)  # cross_attn_weights is None if cross-attention was skipped
+
+        return outputs
+
+
+@auto_docstring
+class DetrPreTrainedModel(PreTrainedModel):
+    config: DetrConfig
+    base_model_prefix = "model"
+    main_input_name = "pixel_values"
+    _no_split_modules = [r"DetrConvEncoder", r"DetrEncoderLayer", r"DetrDecoderLayer"]
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        xavier_std = self.config.init_xavier_std
+
+        if isinstance(module, DetrMHAttentionMap):
+            nn.init.zeros_(module.k_linear.bias)
+            nn.init.zeros_(module.q_linear.bias)
+            nn.init.xavier_uniform_(module.k_linear.weight, gain=xavier_std)
+            nn.init.xavier_uniform_(module.q_linear.weight, gain=xavier_std)
+        elif isinstance(module, DetrLearnedPositionEmbedding):
+            nn.init.uniform_(module.row_embeddings.weight)
+            nn.init.uniform_(module.column_embeddings.weight)
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class DetrEncoder(DetrPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`DetrEncoderLayer`].
+
+    The encoder updates the flattened feature map through multiple self-attention layers.
+
+    Small tweak for DETR:
+
+    - object_queries are added to the forward pass.
+
+    Args:
+        config: DetrConfig
+    """
+
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        self.layers = nn.ModuleList([DetrEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+        # in the original DETR, no layernorm is used at the end of the encoder, as "normalize_before" is set to False by default
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        object_queries=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+
+                - 1 for pixel features that are real (i.e. **not masked**),
+                - 0 for pixel features that are padding (i.e. **masked**).
+
+                [What are attention masks?](../glossary#attention-mask)
+
+            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Object queries that are added to the queries in each self-attention layer.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        for i, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                # we add object_queries as extra input to the encoder_layer
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    object_queries=object_queries,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class DetrDecoder(DetrPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetrDecoderLayer`].
+
+    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+    Some small tweaks for DETR:
+
+    - object_queries and query_position_embeddings are added to the forward pass.
+    - if self.config.auxiliary_loss is set to True, also returns a stack of activations from all decoding layers.
+
+    Args:
+        config: DetrConfig
+    """
+
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+
+        self.layers = nn.ModuleList([DetrDecoderLayer(config) for _ in range(config.decoder_layers)])
+        # in DETR, the decoder uses layernorm after the last decoder layer output
+        self.layernorm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        object_queries=None,
+        query_position_embeddings=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on certain queries. Mask values selected in `[0, 1]`:
+
+                - 1 for queries that are **not masked**,
+                - 0 for queries that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+
+            object_queries (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Object queries that are added to the queries and keys in each cross-attention layer.
+            query_position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                , *optional*): Position embeddings that are added to the values and keys in each self-attention layer.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+            input_shape = inputs_embeds.size()[:-1]
+
+        combined_attention_mask = None
+
+        if attention_mask is not None and combined_attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            combined_attention_mask = combined_attention_mask + _prepare_4d_attention_mask(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            encoder_attention_mask = _prepare_4d_attention_mask(
+                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # optional intermediate hidden states
+        intermediate = () if self.config.auxiliary_loss else None
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                combined_attention_mask,
+                object_queries,
+                query_position_embeddings,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if self.config.auxiliary_loss:
+                hidden_states = self.layernorm(hidden_states)
+                intermediate += (hidden_states,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # finally, apply layernorm
+        hidden_states = self.layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        # stack intermediate decoder activations
+        if self.config.auxiliary_loss:
+            intermediate = torch.stack(intermediate)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions, intermediate]
+                if v is not None
+            )
+        return DetrDecoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+            intermediate_hidden_states=intermediate,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare DETR Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
+    any specific head on top.
+    """
+)
+class DetrModel(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # Create backbone + positional encoding
+        backbone = DetrConvEncoder(config)
+        object_queries = build_position_encoding(config)
+        self.backbone = DetrConvModel(backbone, object_queries)
+
+        # Create projection layer
+        self.input_projection = nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1)
+
+        self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model)
+
+        self.encoder = DetrEncoder(config)
+        self.decoder = DetrDecoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    def freeze_backbone(self):
+        for name, param in self.backbone.conv_encoder.model.named_parameters():
+            param.requires_grad_(False)
+
+    def unfreeze_backbone(self):
+        for name, param in self.backbone.conv_encoder.model.named_parameters():
+            param.requires_grad_(True)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetrModelOutput]:
+        r"""
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DetrModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
+        >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> # the last hidden states are the final query embeddings of the Transformer decoder
+        >>> # these are of shape (batch_size, num_queries, hidden_size)
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 100, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+        # First, sent pixel_values + pixel_mask through Backbone to obtain the features
+        # pixel_values should be of shape (batch_size, num_channels, height, width)
+        # pixel_mask should be of shape (batch_size, height, width)
+        features, object_queries_list = self.backbone(pixel_values, pixel_mask)
+
+        # get final feature map and downsampled mask
+        feature_map, mask = features[-1]
+
+        if mask is None:
+            raise ValueError("Backbone does not return downsampled pixel mask")
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        projected_feature_map = self.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, sent flattened_features + flattened_mask + position embeddings through encoder
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                inputs_embeds=flattened_features,
+                attention_mask=flattened_mask,
+                object_queries=object_queries,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, sent query embeddings + object_queries through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.query_position_embeddings.weight.unsqueeze(0).repeat(batch_size, 1, 1)
+        queries = torch.zeros_like(query_position_embeddings)
+
+        # decoder outputs consists of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            inputs_embeds=queries,
+            attention_mask=None,
+            object_queries=object_queries,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return DetrModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+        )
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class DetrMLPPredictionHead(nn.Module):
+    """
+    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+    height and width of a bounding box w.r.t. an image.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+@auto_docstring(
+    custom_intro="""
+    DETR Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
+    such as COCO detection.
+    """
+)
+class DetrForObjectDetection(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # DETR encoder-decoder model
+        self.model = DetrModel(config)
+
+        # Object detection heads
+        self.class_labels_classifier = nn.Linear(
+            config.d_model, config.num_labels + 1
+        )  # We add one for the "no object" class
+        self.bbox_predictor = DetrMLPPredictionHead(
+            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetrObjectDetectionOutput]:
+        r"""
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DetrForObjectDetection
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
+        >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
+        ...     0
+        ... ]
+
+        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
+        Detected remote with confidence 0.998 at location [40.16, 70.81, 175.55, 117.98]
+        Detected remote with confidence 0.996 at location [333.24, 72.55, 368.33, 187.66]
+        Detected couch with confidence 0.995 at location [-0.02, 1.15, 639.73, 473.76]
+        Detected cat with confidence 0.999 at location [13.24, 52.05, 314.02, 470.93]
+        Detected cat with confidence 0.999 at location [345.4, 23.85, 640.37, 368.72]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # First, sent images through DETR base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values,
+            pixel_mask=pixel_mask,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        # class logits + predicted bounding boxes
+        logits = self.class_labels_classifier(sequence_output)
+        pred_boxes = self.bbox_predictor(sequence_output).sigmoid()
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            outputs_class, outputs_coord = None, None
+            if self.config.auxiliary_loss:
+                intermediate = outputs.intermediate_hidden_states if return_dict else outputs[4]
+                outputs_class = self.class_labels_classifier(intermediate)
+                outputs_coord = self.bbox_predictor(intermediate).sigmoid()
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits, labels, self.device, pred_boxes, self.config, outputs_class, outputs_coord
+            )
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes) + auxiliary_outputs + outputs
+            else:
+                output = (logits, pred_boxes) + outputs
+            return ((loss, loss_dict) + output) if loss is not None else output
+
+        return DetrObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=outputs.last_hidden_state,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks
+    such as COCO panoptic.
+    """
+)
+class DetrForSegmentation(DetrPreTrainedModel):
+    def __init__(self, config: DetrConfig):
+        super().__init__(config)
+
+        # object detection model; `forward` below calls its backbone, encoder, decoder and prediction heads directly
+        self.detr = DetrForObjectDetection(config)
+
+        # segmentation head: input channels = d_model + n_heads; FPN dims = last 3 entries of the reversed size list
+        hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads
+        intermediate_channel_sizes = self.detr.model.backbone.conv_encoder.intermediate_channel_sizes
+
+        self.mask_head = DetrMaskHeadSmallConv(
+            hidden_size + number_of_heads, intermediate_channel_sizes[::-1][-3:], hidden_size
+        )
+
+        self.bbox_attention = DetrMHAttentionMap(
+            hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std
+        )
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetrSegmentationOutput]:
+        r"""
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss, DICE/F-1 loss and Focal loss. List of dicts, each
+            dictionary containing at least the following 3 keys: 'class_labels', 'boxes' and 'masks' (the class labels,
+            bounding boxes and segmentation masks of an image in the batch respectively). The class labels themselves
+            should be a `torch.LongTensor` of len `(number of bounding boxes in the image,)`, the boxes a
+            `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)` and the masks a
+            `torch.FloatTensor` of shape `(number of bounding boxes in the image, height, width)`.
+
+        Examples:
+
+        ```python
+        >>> import io
+        >>> import requests
+        >>> from PIL import Image
+        >>> import torch
+        >>> import numpy
+
+        >>> from transformers import AutoImageProcessor, DetrForSegmentation
+        >>> from transformers.image_transforms import rgb_to_id
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50-panoptic")
+        >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> # Use the `post_process_panoptic_segmentation` method of the `image_processor` to retrieve post-processed panoptic segmentation maps
+        >>> # Segmentation results are returned as a list of dictionaries
+        >>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(300, 500)])
+
+        >>> # A tensor of shape (height, width) where each value denotes a segment id, filled with -1 if no segment is found
+        >>> panoptic_seg = result[0]["segmentation"]
+        >>> # Get prediction score and segment_id to class_id mapping of each segment
+        >>> panoptic_segments_info = result[0]["segments_info"]
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones((batch_size, height, width), device=device)  # default: all-ones mask
+
+        # First, get list of feature maps and position embeddings
+        features, object_queries_list = self.detr.model.backbone(pixel_values, pixel_mask=pixel_mask)
+
+        # Second, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        feature_map, mask = features[-1]
+        batch_size, num_channels, height, width = feature_map.shape  # rebinds height/width to the feature-map dims
+        projected_feature_map = self.detr.model.input_projection(feature_map)
+
+        # Third, flatten the feature map + position embeddings of shape NxCxHxW to NxCxHW, and permute it to NxHWxC
+        # In other words, turn their shape into (batch_size, sequence_length, hidden_size)
+        flattened_features = projected_feature_map.flatten(2).permute(0, 2, 1)
+        object_queries = object_queries_list[-1].flatten(2).permute(0, 2, 1)
+
+        flattened_mask = mask.flatten(1)
+
+        # Fourth, send flattened_features + flattened_mask + position embeddings through the encoder
+        # flattened_features is a Tensor of shape (batch_size, height*width, hidden_size)
+        # flattened_mask is a Tensor of shape (batch_size, height*width)
+        if encoder_outputs is None:
+            encoder_outputs = self.detr.model.encoder(
+                inputs_embeds=flattened_features,  # NOTE(review): the `inputs_embeds` argument is never read here
+                attention_mask=flattened_mask,
+                object_queries=object_queries,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, send query embeddings + position embeddings through the decoder (which is conditioned on the encoder output)
+        query_position_embeddings = self.detr.model.query_position_embeddings.weight.unsqueeze(0).repeat(
+            batch_size, 1, 1
+        )
+        queries = torch.zeros_like(query_position_embeddings)  # NOTE(review): `decoder_inputs_embeds` is never read
+
+        # decoder outputs consist of (dec_features, dec_hidden, dec_attn)
+        decoder_outputs = self.detr.model.decoder(
+            inputs_embeds=queries,
+            attention_mask=None,  # NOTE(review): the `decoder_attention_mask` argument is not forwarded
+            object_queries=object_queries,
+            query_position_embeddings=query_position_embeddings,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=flattened_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        # Sixth, compute logits, pred_boxes and pred_masks
+        logits = self.detr.class_labels_classifier(sequence_output)
+        pred_boxes = self.detr.bbox_predictor(sequence_output).sigmoid()
+
+        memory = encoder_outputs[0].permute(0, 2, 1).view(batch_size, self.config.d_model, height, width)
+        mask = flattened_mask.view(batch_size, height, width)
+
+        # FIXME h_boxes takes the last one computed, keep this in mind
+        # important: we need to reverse the mask, since in the original implementation the mask works reversed
+        # bbox_mask is of shape (batch_size, num_queries, number_of_attention_heads in bbox_attention, height/32, width/32)
+        bbox_mask = self.bbox_attention(sequence_output, memory, mask=~mask)
+
+        seg_masks = self.mask_head(projected_feature_map, bbox_mask, [features[2][0], features[1][0], features[0][0]])
+
+        pred_masks = seg_masks.view(batch_size, self.detr.config.num_queries, seg_masks.shape[-2], seg_masks.shape[-1])
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            outputs_class, outputs_coord = None, None
+            if self.config.auxiliary_loss:
+                intermediate = decoder_outputs.intermediate_hidden_states if return_dict else decoder_outputs[-1]
+                outputs_class = self.detr.class_labels_classifier(intermediate)
+                outputs_coord = self.detr.bbox_predictor(intermediate).sigmoid()
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits, labels, device, pred_boxes, pred_masks, self.config, outputs_class, outputs_coord
+            )
+
+        if not return_dict:  # tuple output; (loss, loss_dict) is prepended only when a loss was computed
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes, pred_masks) + auxiliary_outputs + decoder_outputs + encoder_outputs
+            else:
+                output = (logits, pred_boxes, pred_masks) + decoder_outputs + encoder_outputs
+            return ((loss, loss_dict) + output) if loss is not None else output
+
+        return DetrSegmentationOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            pred_masks=pred_masks,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+def _expand(tensor, length: int):  # repeat `tensor` `length` times along a new dim 1, then fold that dim into dim 0
+    return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1)  # five repeat factors; presumably a 4-D input
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/segmentation.py
+class DetrMaskHeadSmallConv(nn.Module):
+    """
+    Simple convolutional head, using group norm. Upsampling is done using a FPN approach
+    """
+
+    def __init__(self, dim, fpn_dims, context_dim):
+        super().__init__()
+
+        if dim % 8 != 0:  # gn1 below uses 8 groups over `dim` channels
+            raise ValueError(
+                "The hidden_size + number of attention heads must be divisible by 8 as the number of groups in"
+                " GroupNorm is set to 8"
+            )
+
+        inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64]
+
+        self.lay1 = nn.Conv2d(dim, dim, 3, padding=1)
+        self.gn1 = nn.GroupNorm(8, dim)
+        self.lay2 = nn.Conv2d(dim, inter_dims[1], 3, padding=1)
+        self.gn2 = nn.GroupNorm(min(8, inter_dims[1]), inter_dims[1])
+        self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1)
+        self.gn3 = nn.GroupNorm(min(8, inter_dims[2]), inter_dims[2])
+        self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1)
+        self.gn4 = nn.GroupNorm(min(8, inter_dims[3]), inter_dims[3])
+        self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1)
+        self.gn5 = nn.GroupNorm(min(8, inter_dims[4]), inter_dims[4])
+        self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1)  # single output channel; inter_dims[5] is never used
+
+        self.dim = dim
+
+        self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1)  # 1x1 convs: map each FPN input to its stage's channels
+        self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1)
+        self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1)
+
+        for m in self.modules():  # re-initialise every Conv2d in this head: Kaiming-uniform weights, zero biases
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_uniform_(m.weight, a=1)
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x: Tensor, bbox_mask: Tensor, fpns: list[Tensor]):
+        # here we concatenate x, the projected feature map, of shape (batch_size, d_model, height/32, width/32) with
+        # the bbox_mask = the attention maps of shape (batch_size, n_queries, n_heads, height/32, width/32).
+        # x is repeated once per query (bbox_mask.shape[1]) so both tensors share a flattened batch*queries dim.
+        x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1)
+
+        x = self.lay1(x)
+        x = self.gn1(x)
+        x = nn.functional.relu(x)
+        x = self.lay2(x)
+        x = self.gn2(x)
+        x = nn.functional.relu(x)
+
+        cur_fpn = self.adapter1(fpns[0])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))  # tile along dim 0 to match x
+        x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")  # resize x, then add
+        x = self.lay3(x)
+        x = self.gn3(x)
+        x = nn.functional.relu(x)
+
+        cur_fpn = self.adapter2(fpns[1])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay4(x)
+        x = self.gn4(x)
+        x = nn.functional.relu(x)
+
+        cur_fpn = self.adapter3(fpns[2])
+        if cur_fpn.size(0) != x.size(0):
+            cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0))
+        x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest")
+        x = self.lay5(x)
+        x = self.gn5(x)
+        x = nn.functional.relu(x)
+
+        x = self.out_lay(x)  # no activation is applied after the final conv
+        return x
+
+
+class DetrMHAttentionMap(nn.Module):
+    """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)"""
+
+    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None):  # `std` is unused in this class
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.dropout = nn.Dropout(dropout)
+
+        self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+        self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias)
+
+        self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5  # 1 / sqrt(per-head dim)
+
+    def forward(self, q, k, mask: Optional[Tensor] = None):
+        q = self.q_linear(q)
+        k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias)  # k_linear as a 1x1 conv
+        queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads)
+        keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1])
+        weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head)
+
+        if mask is not None:  # positions where `mask` is True are filled with the dtype minimum before the softmax
+            weights = weights.masked_fill(mask.unsqueeze(1).unsqueeze(1), torch.finfo(weights.dtype).min)
+        weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size())  # joint over heads and h*w
+        weights = self.dropout(weights)
+        return weights
+
+
+__all__ = [  # names exported by `from <module> import *`
+    "DetrForObjectDetection",
+    "DetrForSegmentation",
+    "DetrModel",
+    "DetrPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93be9e5f1e3588c01952c1d87a70ef6ce500ecc4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b586d9fef055c85d750e118f2d830f7eb4064994
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/configuration_dinov3_convnext.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09ddedc251f4620bb5ba2126e8a6d9e71a6c1879
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_convnext/__pycache__/modeling_dinov3_convnext.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1e99600d7c072ae8765a118a2cd144593b89e04
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b344960e6ce231f9d6dd63215971482dce5a7103
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb550e61c6940f3d454202c26aceab907300592e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22b0622c27ceda17f08536c94ccb4acb625c03d7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56b4b7e884c57aa96066e57d8bb18d0fd1786bd7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e5be363e1b2fba80cccee13a65973823e32610b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..549d3adfb3bcfa036406792bf2903ff5ea9271d6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/configuration_distilbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79b8e2a1008b2abcda69eca57e01bcd2f71fd4c1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_distilbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..068afca20bca434bbbbe076421f498161cb2ccfc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_flax_distilbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c1885fc10cb82d1e777c29d3131ddfb068b7be5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/modeling_tf_distilbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d29c8a431e95f4b85af7979efe21f71e86257c16
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1db797287c4afed001d62c7093d4fe97a864746d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/distilbert/__pycache__/tokenization_distilbert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dit/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47beca258c26eb2cbee9ae5dcd4510cf21d0791e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dit/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8652623285ebf5b40684c48e2a2012d6a829ad2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8007eb0d4c644adf6d1d24b67678fa410e0a5182
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/configuration_doge.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7487dc99f8a258053173c016db9d93241140a728
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modeling_doge.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38b2479ea69e3e5eca3e0cd497495bf0792ea909
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/doge/__pycache__/modular_doge.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce4e6fd237c20dd953970f619cc05687fba86c45
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e89d2f97b8337befafcce628de4a6d36678bd18
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/configuration_donut_swin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3982c72c0798f93ade8f59b71feb476b8940c32d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/feature_extraction_donut.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..409ffbec111390e5f31e4e945f97385b1f83f539
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..123043725bacea7395b80b3586c7fe709b106bcf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/image_processing_donut_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1f7b2a69f389abd390b15cb11579e59878bd94c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/modeling_donut_swin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1762e1424b76626844937606610c571da0a6ea2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/donut/__pycache__/processing_donut.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e24155031b1bf4d68e99ad5f95bf4983f905043a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c95ed77320da454e12484919e297fc2e4ffb8cd4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..425abfc6de1ee98a488b66a86ab21a3ba967fc99
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..658c6c6e846dbf1908daef2dafc3d6a2105aa222
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c640d810b48bdce483ca3d9cdc838403aa97db75
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5267484d2063c29e718a37ccef484196207fb946
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84979fd784ad1785c3cd57dc7ed9c5096c8f3de4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..122cd1a9f6acac7bb722141c418c9144cbaa0d68
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ced9484153c8b2b27e7e180a685bd8de44f980cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/configuration_efficientloftr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da28b3ab0e9965143e3b327da4b99c2075cce626
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66a2d002f8c0b45480682a56c154ea34f9f21f4f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/image_processing_efficientloftr_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e763d63c7aa4724f135d9d457060e1b24c127696
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientloftr/__pycache__/modeling_efficientloftr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c219f75ffd2eb219985be6968e9176a4c095f638
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c38b3c5944ac0b1e967c817eb80e60ec01386039
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/configuration_efficientnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3e31f74d6f64e3ef2b35090174810bd5039ce30
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5507a43b746def945e48a9ed90c66c15a3bf7cb9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/image_processing_efficientnet_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf3c11e4330eb341c8e515fef3ec797b8f057fc7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/efficientnet/__pycache__/modeling_efficientnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee253b7b557c34b98ca22323e74a5a8afff2cbb8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a2ca02658cd35fb004236ce7451806fc24ecfcf8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/configuration_ernie.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2e715f93241e5cf9b6537ab4c5f47fd9d904a75
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie/__pycache__/modeling_ernie.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c6f92fecece3367b9ef64c39e9fb183fa2ec7f9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dc0447909984bd21a411654459bbb4c660b6f95
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/configuration_ernie4_5_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f779cd0aee0cb35ea9c52f146dd3e138b41373ac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modeling_ernie4_5_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4a614fecb6e8e85d374cf49a12c3c8df1bf4431
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ernie4_5_moe/__pycache__/modular_ernie4_5_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a343b38b039690e19562170d1b9b985b4b26201
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/processing_evolla.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c63c9ef5db5ecdcf1e53633ed266026cf7c71d4d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6847c790e11a5e2a8e23c23781dabf4b76656641
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/configuration_falcon_mamba.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e981d9cbcb1e456c206b3bec252df1598e23575a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_flaubert import *
+    from .modeling_flaubert import *
+    from .modeling_tf_flaubert import *
+    from .tokenization_flaubert import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..071a74fe69b420954ff6ce7154b227c0e0d7e4dd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/configuration_flaubert.py
@@ -0,0 +1,235 @@
+# coding=utf-8
+# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flaubert configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FlaubertConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`FlaubertModel`] or a [`TFFlaubertModel`]. It is
+    used to instantiate a FlauBERT model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FlauBERT
+    [flaubert/flaubert_base_uncased](https://huggingface.co/flaubert/flaubert_base_uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        pre_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply the layer normalization before or after the feed forward layer following the attention in
+            each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with
+            Structured Dropout. ICLR 2020)
+        vocab_size (`int`, *optional*, defaults to 30145):
+            Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`FlaubertModel`] or [`TFFlaubertModel`].
+        emb_dim (`int`, *optional*, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        n_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention mechanism
+        gelu_activation (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a *gelu* activation instead of *relu*.
+        sinusoidal_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
+        causal (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
+            order to only attend to the left-side context instead of a bidirectional context.
+        asm (`bool`, *optional*, defaults to `False`):
+            Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
+            layer.
+        n_langs (`int`, *optional*, defaults to 1):
+            The number of languages the model handles. Set to 1 for monolingual models.
+        use_lang_emb (`bool`, *optional*, defaults to `True`):
+            Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual
+            models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for information
+            on how to use them.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        embed_init_std (`float`, *optional*, defaults to 2048^-0.5):
+            The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
+            embedding matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        bos_index (`int`, *optional*, defaults to 0):
+            The index of the beginning of sentence token in the vocabulary.
+        eos_index (`int`, *optional*, defaults to 1):
+            The index of the end of sentence token in the vocabulary.
+        pad_index (`int`, *optional*, defaults to 2):
+            The index of the padding token in the vocabulary.
+        unk_index (`int`, *optional*, defaults to 3):
+            The index of the unknown token in the vocabulary.
+        mask_index (`int`, *optional*, defaults to 5):
+            The index of the masking token in the vocabulary.
+        is_encoder (`bool`, *optional*, defaults to `True`):
+            Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
+        summary_type (`str`, *optional*, defaults to "first"):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Has to be one of the following options:
+
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*):
+            Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
+
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+            Used in the sequence classification and multiple choice models.
+
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
+            Used in the sequence classification and multiple choice models.
+
+            The dropout ratio to be used after the projection and activation.
+        start_n_top (`int`, *optional*, defaults to 5):
+            Used in the SQuAD evaluation script.
+        end_n_top (`int`, *optional*, defaults to 5):
+            Used in the SQuAD evaluation script.
+        mask_token_id (`int`, *optional*, defaults to 0):
+            Model agnostic parameter to identify masked tokens when generating text in an MLM context.
+        lang_id (`int`, *optional*, defaults to 0):
+            The ID of the language used by the model. This parameter is used when generating text in a given language.
+    """
+
+    model_type = "flaubert"
+    attribute_map = {
+        "hidden_size": "emb_dim",
+        "num_attention_heads": "n_heads",
+        "num_hidden_layers": "n_layers",
+        "n_words": "vocab_size",  # For backward compatibility
+    }
+
+    def __init__(
+        self,
+        pre_norm=False,
+        layerdrop=0.0,
+        vocab_size=30145,
+        emb_dim=2048,
+        n_layers=12,
+        n_heads=16,
+        dropout=0.1,
+        attention_dropout=0.1,
+        gelu_activation=True,
+        sinusoidal_embeddings=False,
+        causal=False,
+        asm=False,
+        n_langs=1,
+        use_lang_emb=True,
+        max_position_embeddings=512,
+        embed_init_std=2048**-0.5,
+        layer_norm_eps=1e-12,
+        init_std=0.02,
+        bos_index=0,
+        eos_index=1,
+        pad_index=2,
+        unk_index=3,
+        mask_index=5,
+        is_encoder=True,
+        summary_type="first",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        start_n_top=5,
+        end_n_top=5,
+        mask_token_id=0,
+        lang_id=0,
+        pad_token_id=2,
+        bos_token_id=0,
+        **kwargs,
+    ):
+        """Constructs FlaubertConfig."""
+        self.pre_norm = pre_norm
+        self.layerdrop = layerdrop
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.gelu_activation = gelu_activation
+        self.sinusoidal_embeddings = sinusoidal_embeddings
+        self.causal = causal
+        self.asm = asm
+        self.n_langs = n_langs
+        self.use_lang_emb = use_lang_emb
+        self.layer_norm_eps = layer_norm_eps
+        self.bos_index = bos_index
+        self.eos_index = eos_index
+        self.pad_index = pad_index
+        self.unk_index = unk_index
+        self.mask_index = mask_index
+        self.is_encoder = is_encoder
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_init_std = embed_init_std
+        self.init_std = init_std
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.summary_first_dropout = summary_first_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
+        self.mask_token_id = mask_token_id
+        self.lang_id = lang_id
+
+        if "n_words" in kwargs:
+            self.n_words = kwargs["n_words"]
+
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs)
+
+
+class FlaubertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["FlaubertConfig", "FlaubertOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dadc6f5377b4007c78f44d81d7a39acf2dedbdb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_flaubert.py
@@ -0,0 +1,1700 @@
+# coding=utf-8
+# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Flaubert model, based on XLM."""
+
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import gelu, get_activation
+from ...cache_utils import DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_outputs import (
+    BaseModelOutput,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, logging
+from .configuration_flaubert import FlaubertConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.xlm.modeling_xlm.create_sinusoidal_embeddings
+def create_sinusoidal_embeddings(n_pos, dim, out):
+    """
+    Fill `out` in place with fixed sinusoidal position encodings.
+
+    Entry `(pos, j)` is built from the angle `pos / 10000 ** (2 * (j // 2) / dim)`: even columns receive its sine,
+    odd columns its cosine.
+
+    Args:
+        n_pos: Number of positions (rows written).
+        dim: Embedding width (columns written).
+        out: Tensor indexed as `out[:, 0::2]` / `out[:, 1::2]`; it is overwritten, marked `requires_grad = False`
+            and detached in place.
+
+    Returns:
+        None. The result is written into `out`.
+    """
+    # The divisor only depends on the column index, so evaluate it once per column instead of once per
+    # (position, column) pair; the per-column scalar expression is unchanged.
+    denominators = np.array([np.power(10000, 2 * (j // 2) / dim) for j in range(dim)], dtype=np.float64)
+    # Broadcasting (n_pos, 1) / (1, dim) yields the full angle table in float64.
+    position_enc = np.arange(n_pos)[:, None] / denominators[None, :]
+    # Freeze the tensor before the in-place writes below.
+    out.requires_grad = False
+    out[:, 0::2] = torch.from_numpy(np.sin(position_enc[:, 0::2])).float()
+    out[:, 1::2] = torch.from_numpy(np.cos(position_enc[:, 1::2])).float()
+    out.detach_()
+
+
+# Copied from transformers.models.xlm.modeling_xlm.get_masks
+def get_masks(slen, lengths, causal, padding_mask=None):
+    """
+    Build the hidden-states mask and the attention mask.
+
+    Args:
+        slen: Sequence length of the batch.
+        lengths: 1-D tensor holding one length per batch element.
+        causal: When truthy, the attention mask is lower-triangular instead of being the hidden-states mask.
+        padding_mask: Optional precomputed mask; when given it is returned as the hidden-states mask unchanged.
+
+    Returns:
+        `(mask, attn_mask)`: `mask` has size `(bs, slen)`; `attn_mask` is `mask` itself, or a `(bs, slen, slen)`
+        tensor in the causal case.
+    """
+    positions = torch.arange(slen, dtype=torch.long, device=lengths.device)
+
+    if padding_mask is None:
+        assert lengths.max().item() <= slen
+        # position p of row b is kept when p < lengths[b]
+        mask = positions < lengths[:, None]
+    else:
+        mask = padding_mask
+
+    batch_size = lengths.size(0)
+    # default: attention mask is the hidden-states mask; causal: query q may only look at keys k <= q
+    attn_mask = mask
+    if causal:
+        attn_mask = positions[None, None, :].repeat(batch_size, slen, 1) <= positions[None, :, None]
+
+    # sanity check
+    assert mask.size() == (batch_size, slen)
+    assert causal is False or attn_mask.size() == (batch_size, slen, slen)
+
+    return mask, attn_mask
+
+
+# Copied from transformers.models.xlm.modeling_xlm.MultiHeadAttention
+class MultiHeadAttention(nn.Module):
+    """
+    Multi-head scaled dot-product attention with separate query/key/value/output linear projections.
+
+    Supports self-attention, attention over externally supplied states (`kv`), head pruning and an optional
+    key/value cache.
+    """
+
+    def __init__(self, n_heads, dim, config, layer_idx: int = 0):
+        """
+        Args:
+            n_heads: Number of attention heads; must divide `dim`.
+            dim: Input and output feature size of the four linear projections.
+            config: Configuration object; only `attention_dropout` is read here.
+            layer_idx: Index stored as `self.layer_id` and used to address this layer's entry in a cache.
+        """
+        super().__init__()
+        self.layer_id = layer_idx
+        self.dim = dim
+        self.n_heads = n_heads
+        self.head_dim = dim // n_heads
+        self.dropout = config.attention_dropout
+        assert self.dim % self.n_heads == 0
+
+        self.q_lin = nn.Linear(dim, dim)
+        self.k_lin = nn.Linear(dim, dim)
+        self.v_lin = nn.Linear(dim, dim)
+        self.out_lin = nn.Linear(dim, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        """
+        Remove the given attention heads from the four projection layers.
+
+        Updates `n_heads`, `dim` and `pruned_heads`; `head_dim` is left as set in `__init__`. Does nothing when
+        `heads` is empty.
+        """
+        attention_head_size = self.dim // self.n_heads
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.n_heads, attention_head_size, self.pruned_heads)
+        # Prune linear layers
+        self.q_lin = prune_linear_layer(self.q_lin, index)
+        self.k_lin = prune_linear_layer(self.k_lin, index)
+        self.v_lin = prune_linear_layer(self.v_lin, index)
+        # the output projection is pruned along dim=1, unlike q/k/v which use the helper's default dim
+        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.dim = attention_head_size * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        input,
+        mask,
+        kv=None,
+        cache=None,
+        head_mask=None,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+
+        Args:
+            input: Query states of shape `(bs, qlen, dim)`.
+            mask: Attention mask, `(bs, klen)` or 3-dimensional `(bs, klen, klen)`; positions equal to 0 are
+                filled with the minimum value of the score dtype before the softmax.
+            kv: Optional states the keys and values are computed from instead of `input`.
+            cache: Optional cache. An `EncoderDecoderCache` is split into its self- and cross-attention parts;
+                any other object is used directly and must provide `update(...)`.
+            head_mask: Optional tensor multiplied onto the attention weights.
+            output_attentions: When truthy, the attention weights are appended to the returned tuple.
+            cache_position: Passed to the cache update for self-attention; replaced by `None` for cross-attention.
+
+        Returns:
+            `(output,)` or `(output, weights)`, where `output` is the result of `out_lin` and `weights` has shape
+            `(bs, n_heads, qlen, klen)`.
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = input.size()
+        is_cross_attention = kv is not None
+        # target shape used below to broadcast the mask against the (bs, n_heads, qlen, klen) scores
+        mask_reshape = (bs, 1, qlen, -1) if mask.dim() == 3 else (bs, 1, 1, -1)
+
+        q = self.q_lin(input).view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2)
+        if cache is not None:
+            if isinstance(cache, EncoderDecoderCache):
+                is_updated = cache.is_updated.get(self.layer_id)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = cache.cross_attention_cache
+                else:
+                    curr_past_key_value = cache.self_attention_cache
+            else:
+                curr_past_key_value = cache
+
+        current_states = kv if is_cross_attention else input
+        # NOTE(review): `is_updated` is only assigned in the EncoderDecoderCache branch above. With `kv` given and
+        # a cache of another type, this condition would read an unbound local - confirm callers never do that.
+        if is_cross_attention and cache is not None and is_updated:
+            # reuse k,v, cross_attentions
+            k = curr_past_key_value.key_cache[self.layer_id]
+            v = curr_past_key_value.value_cache[self.layer_id]
+        else:
+            k = self.k_lin(current_states)
+            v = self.v_lin(current_states)
+            k = k.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2)
+            v = v.view(bs, -1, self.n_heads, self.head_dim).transpose(1, 2)
+
+            if cache is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                k, v = curr_past_key_value.update(k, v, self.layer_id, {"cache_position": cache_position})
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention:
+                    cache.is_updated[self.layer_id] = True
+
+        # scale the queries (rather than the scores) by 1/sqrt(head_dim)
+        q = q / math.sqrt(self.head_dim)  # (bs, n_heads, qlen, head_dim)
+        scores = torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, qlen, klen)
+        # from here on `mask` is True where attention must be blocked
+        mask = (mask == 0).view(mask_reshape).expand_as(scores)  # (bs, n_heads, qlen, klen)
+        scores.masked_fill_(mask, torch.finfo(scores.dtype).min)  # (bs, n_heads, qlen, klen)
+
+        # softmax is evaluated in float32 and cast back to the dtype of `scores`
+        weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)  # (bs, n_heads, qlen, klen)
+        weights = nn.functional.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = torch.matmul(weights, v)  # (bs, n_heads, qlen, head_dim)
+        # merge the heads back into a single feature dimension
+        context = context.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * self.head_dim)
+
+        outputs = (self.out_lin(context),)
+        if output_attentions:
+            outputs = outputs + (weights,)
+        return outputs
+
+
+# Copied from transformers.models.xlm.modeling_xlm.TransformerFFN
+class TransformerFFN(nn.Module):
+    """
+    Feed-forward block: linear -> activation -> linear -> dropout.
+
+    The activation is `gelu` when `config.gelu_activation` is truthy and ReLU otherwise. `forward` hands the
+    computation to `apply_chunking_to_forward` together with `config.chunk_size_feed_forward` and dimension 1.
+    """
+
+    def __init__(self, in_dim, dim_hidden, out_dim, config):
+        super().__init__()
+        self.seq_len_dim = 1
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.dropout = config.dropout
+        self.lin1 = nn.Linear(in_dim, dim_hidden)
+        self.lin2 = nn.Linear(dim_hidden, out_dim)
+        self.act = gelu if config.gelu_activation else nn.functional.relu
+
+    def forward(self, input):
+        """Apply `ff_chunk` to `input` through the chunking helper."""
+        return apply_chunking_to_forward(self.ff_chunk, self.chunk_size_feed_forward, self.seq_len_dim, input)
+
+    def ff_chunk(self, input):
+        """Run the two projections with the activation in between, then dropout (active only in training)."""
+        hidden = self.act(self.lin1(input))
+        projected = self.lin2(hidden)
+        return nn.functional.dropout(projected, p=self.dropout, training=self.training)
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMPredLayer with XLM->Flaubert
+class FlaubertPredLayer(nn.Module):
+    """
+    Output projection over the vocabulary, using either a plain linear layer scored with cross-entropy or an
+    adaptive log-softmax, selected by `config.asm`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        emb_dim = config.emb_dim
+        self.asm = config.asm
+        self.n_words = config.n_words
+        self.pad_index = config.pad_index
+
+        if config.asm is not False:
+            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
+                in_features=emb_dim,
+                n_classes=config.n_words,
+                cutoffs=config.asm_cutoffs,
+                div_value=config.asm_div_value,
+                head_bias=True,  # default is False
+            )
+        else:
+            self.proj = nn.Linear(emb_dim, config.n_words, bias=True)
+
+    def forward(self, x, y=None):
+        """
+        Score `x` and, when targets `y` are given, also compute the loss.
+
+        Returns:
+            `(scores,)` when `y` is None, otherwise `(loss, scores)`.
+        """
+        if self.asm is False:
+            scores = self.proj(x)
+            if y is None:
+                return (scores,)
+            # mean cross-entropy over all flattened positions
+            loss = nn.functional.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="mean")
+        else:
+            scores = self.proj.log_prob(x)
+            if y is None:
+                return (scores,)
+            # the adaptive softmax module returns (output, loss); only the loss is kept
+            _, loss = self.proj(x, y)
+
+        return (loss, scores)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of question answering models using a [`~modeling_utils.FlaubertSQuADHead`].
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMSquadHeadOutput with XLM->Flaubert
+class FlaubertSquadHeadOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
+        Classification loss as the sum of start token, end token (and is_impossible if provided) classification
+        losses.
+    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Scores of the top `config.start_n_top` start token candidates (beam-search). `FlaubertSQuADHead` fills
+        this field from a softmax over the start logits.
+    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices of the top `config.start_n_top` start token candidates (beam-search).
+    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Scores of the top `config.start_n_top * config.end_n_top` end token candidates (beam-search).
+        `FlaubertSQuADHead` fills this field from a softmax over the end logits.
+    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices of the top `config.start_n_top * config.end_n_top` end token candidates (beam-search).
+    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Logits for the `is_impossible` label of the answers.
+    """
+
+    # Every field defaults to None: the training path of the head only sets `loss`, while the inference path
+    # sets the five remaining fields.
+    loss: Optional[torch.FloatTensor] = None
+    start_top_log_probs: Optional[torch.FloatTensor] = None
+    start_top_index: Optional[torch.LongTensor] = None
+    end_top_log_probs: Optional[torch.FloatTensor] = None
+    end_top_index: Optional[torch.LongTensor] = None
+    cls_logits: Optional[torch.FloatTensor] = None
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerStartLogits with XLM->Flaubert
+class FlaubertPoolerStartLogits(nn.Module):
+    """
+    Produce one SQuAD start logit per token from the sequence hidden states.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            Model configuration; its `hidden_size` sets the input width of the projection.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, 1)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means
+                the token is masked out.
+
+        Returns:
+            `torch.FloatTensor`: The start logits for SQuAD.
+        """
+        logits = self.dense(hidden_states).squeeze(-1)
+        if p_mask is None:
+            return logits
+
+        # Masked positions get a large negative value; a smaller magnitude is used for a float16 mask.
+        penalty = 65500 if p_mask.dtype == torch.float16 else 1e30
+        return logits * (1 - p_mask) - penalty * p_mask
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerEndLogits with XLM->Flaubert
+class FlaubertPoolerEndLogits(nn.Module):
+    """
+    Produce one SQuAD end logit per token, conditioned on a start-token representation.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            Model configuration; `hidden_size` sizes the layers and `layer_norm_eps` configures the layer norm.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.dense_0 = nn.Linear(hidden_size * 2, hidden_size)
+        self.activation = nn.Tanh()
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+        self.dense_1 = nn.Linear(hidden_size, 1)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_states: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        p_mask: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
+                The hidden states of the first tokens for the labeled span.
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                The position of the first token for the labeled span.
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+                Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means
+                the token is masked out.
+
+        <Tip>
+
+        At least one of `start_states` and `start_positions` must be given. When both are given,
+        `start_positions` takes precedence.
+
+        </Tip>
+
+        Returns:
+            `torch.FloatTensor`: The end logits for SQuAD.
+        """
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
+        if start_positions is not None:
+            seq_len, hidden_size = hidden_states.shape[-2:]
+            gather_index = start_positions[:, None, None].expand(-1, -1, hidden_size)  # shape (bsz, 1, hsz)
+            picked = hidden_states.gather(-2, gather_index)  # shape (bsz, 1, hsz)
+            # repeat the start token state along the sequence so it can be paired with every position
+            start_states = picked.expand(-1, seq_len, -1)  # shape (bsz, slen, hsz)
+
+        paired = torch.cat([hidden_states, start_states], dim=-1)
+        logits = self.dense_1(self.LayerNorm(self.activation(self.dense_0(paired)))).squeeze(-1)
+
+        if p_mask is None:
+            return logits
+
+        # Masked positions get a large negative value; a smaller magnitude is used for a float16 mask.
+        penalty = 65500 if p_mask.dtype == torch.float16 else 1e30
+        return logits * (1 - p_mask) - penalty * p_mask
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMPoolerAnswerClass with XLM->Flaubert
+class FlaubertPoolerAnswerClass(nn.Module):
+    """
+    Produce the SQuAD 2.0 answer-class logit from the start-token and classification-token hidden states.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            Model configuration; its `hidden_size` sizes the layers.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.dense_0 = nn.Linear(hidden_size * 2, hidden_size)
+        self.activation = nn.Tanh()
+        self.dense_1 = nn.Linear(hidden_size, 1, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_states: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        cls_index: Optional[torch.LongTensor] = None,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+                The final hidden states of the model.
+            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
+                The hidden states of the first tokens for the labeled span.
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                The position of the first token for the labeled span.
+            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+
+        <Tip>
+
+        At least one of `start_states` and `start_positions` must be given. When both are given,
+        `start_positions` takes precedence.
+
+        </Tip>
+
+        Returns:
+            `torch.FloatTensor`: The SQuAD 2.0 answer class.
+        """
+        # The end position is deliberately not used, so each sample yields exactly one `cls_logits` value.
+        hidden_size = hidden_states.shape[-1]
+        assert start_states is not None or start_positions is not None, (
+            "One of start_states, start_positions should be not None"
+        )
+        if start_positions is not None:
+            start_index = start_positions[:, None, None].expand(-1, -1, hidden_size)  # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_index).squeeze(-2)  # shape (bsz, hsz)
+
+        if cls_index is None:
+            # fall back to the last token of every sequence
+            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)
+        else:
+            cls_gather = cls_index[:, None, None].expand(-1, -1, hidden_size)  # shape (bsz, 1, hsz)
+            cls_token_state = hidden_states.gather(-2, cls_gather).squeeze(-2)  # shape (bsz, hsz)
+
+        features = torch.cat([start_states, cls_token_state], dim=-1)
+        return self.dense_1(self.activation(self.dense_0(features))).squeeze(-1)
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMSQuADHead with XLM->Flaubert
+class FlaubertSQuADHead(nn.Module):
+    r"""
+    A SQuAD head inspired by XLNet.
+
+    With both `start_positions` and `end_positions` given, `forward` returns only the training loss. Otherwise it
+    returns the top start/end candidates and the answer-class logits.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps`
+            to use.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
+
+        self.start_logits = FlaubertPoolerStartLogits(config)
+        self.end_logits = FlaubertPoolerEndLogits(config)
+        self.answer_class = FlaubertPoolerAnswerClass(config)
+
+    @auto_docstring
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        cls_index: Optional[torch.LongTensor] = None,
+        is_impossible: Optional[torch.LongTensor] = None,
+        p_mask: Optional[torch.FloatTensor] = None,
+        return_dict: bool = False,
+    ) -> Union[FlaubertSquadHeadOutput, tuple[torch.FloatTensor]]:
+        r"""
+        hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
+            Final hidden states of the model on the sequence tokens.
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Positions of the first token for the labeled span.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Positions of the last token for the labeled span.
+        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Whether the question has a possible answer in the paragraph or not.
+        p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
+            Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
+            should be masked.
+        """
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting
+            # (in place, so the tensors passed by the caller are modified as well)
+            for x in (start_positions, end_positions, cls_index, is_impossible):
+                if x is not None and x.dim() > 1:
+                    x.squeeze_(-1)
+
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                # BCEWithLogitsLoss needs a floating-point target, while `is_impossible` is documented as a
+                # LongTensor; cast it to the logits dtype (this leaves float targets unchanged).
+                cls_loss = loss_fct_cls(cls_logits, is_impossible.to(cls_logits.dtype))
+
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
+                total_loss += cls_loss * 0.5
+
+            return FlaubertSquadHeadOutput(loss=total_loss) if return_dict else (total_loss,)
+
+        else:
+            # during inference, compute the end logits based on beam search
+            bsz, slen, hsz = hidden_states.size()
+            # despite the `log_probs` names, the values below are softmax probabilities
+            start_log_probs = nn.functional.softmax(start_logits, dim=-1)  # shape (bsz, slen)
+
+            start_top_log_probs, start_top_index = torch.topk(
+                start_log_probs, self.start_n_top, dim=-1
+            )  # shape (bsz, start_n_top)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)  # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp)  # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)
+
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
+                start_states
+            )  # shape (bsz, slen, start_n_top, hsz)
+            # add a trailing axis so the mask broadcasts over the start_n_top candidates
+            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
+            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
+            end_log_probs = nn.functional.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
+
+            end_top_log_probs, end_top_index = torch.topk(
+                end_log_probs, self.end_n_top, dim=1
+            )  # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
+            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
+
+            # start representation = hidden states averaged with the start probabilities as weights
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+
+            if not return_dict:
+                return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits)
+            else:
+                return FlaubertSquadHeadOutput(
+                    start_top_log_probs=start_top_log_probs,
+                    start_top_index=start_top_index,
+                    end_top_log_probs=end_top_log_probs,
+                    end_top_index=end_top_index,
+                    cls_logits=cls_logits,
+                )
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->Flaubert
+class FlaubertSequenceSummary(nn.Module):
+    r"""
+    Reduce the hidden states of a sequence to a single summary vector.
+
+    Args:
+        config ([`FlaubertConfig`]):
+            The config used by the model. The following attributes are read when present (see the config class
+            of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- How the summary vector is selected. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Name of the activation applied to the output, resolved
+              with `get_activation`; `None` or an empty string adds no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+    """
+
+    def __init__(self, config: FlaubertConfig):
+        super().__init__()
+
+        self.summary_type = getattr(config, "summary_type", "last")
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        # Optional projection, either to the label space or back to the hidden size.
+        if getattr(config, "summary_use_proj", False):
+            project_to_labels = getattr(config, "summary_proj_to_labels", False) and config.num_labels > 0
+            num_classes = config.num_labels if project_to_labels else config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+        else:
+            self.summary = nn.Identity()
+
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()
+
+        # Dropout layers are only created for strictly positive probabilities.
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+        else:
+            self.first_dropout = nn.Identity()
+
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+        else:
+            self.last_dropout = nn.Identity()
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Reduce the hidden states of a sequence to a single summary vector.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Only read when `summary_type == "cls_index"`: position of the classification token. When it is
+                `None`, the last token of the sequence is used.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+        """
+        mode = self.summary_type
+        if mode == "last":
+            output = hidden_states[:, -1]
+        elif mode == "first":
+            output = hidden_states[:, 0]
+        elif mode == "mean":
+            output = hidden_states.mean(dim=1)
+        elif mode == "cls_index":
+            if cls_index is not None:
+                gather_index = cls_index[..., None, None]
+                gather_index = gather_index.expand((-1,) * (gather_index.dim() - 1) + (hidden_states.size(-1),))
+            else:
+                last_position = hidden_states.shape[-2] - 1
+                gather_index = torch.full_like(hidden_states[..., :1, :], last_position, dtype=torch.long)
+            # shape of gather_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, gather_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif mode == "attn":
+            raise NotImplementedError
+
+        # dropout -> projection -> activation -> dropout
+        projected = self.summary(self.first_dropout(output))
+        return self.last_dropout(self.activation(projected))
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMPreTrainedModel with XLM->Flaubert
+class FlaubertPreTrainedModel(PreTrainedModel):
+    config: FlaubertConfig
+    load_tf_weights = None
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    @property
+    def dummy_inputs(self):
+        """Small fixed batch of `input_ids` / `attention_mask` (and `langs` when language embeddings are used)."""
+        input_ids = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attention_mask = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        # language ids are only supplied when the model uses language embeddings for more than one language
+        langs = None
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        return {"input_ids": input_ids, "attention_mask": attention_mask, "langs": langs}
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module` according to its type."""
+        config = self.config
+
+        if isinstance(module, nn.Embedding):
+            if config is not None and config.embed_init_std is not None:
+                nn.init.normal_(module.weight, mean=0, std=config.embed_init_std)
+            # the padding row is zeroed regardless of whether the weights were re-drawn above
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+        if isinstance(module, nn.Linear):
+            if config is not None and config.init_std is not None:
+                nn.init.normal_(module.weight, mean=0, std=config.init_std)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0.0)
+
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+        # the base model overwrites its position embeddings with the fixed sinusoidal table when configured
+        if isinstance(module, FlaubertModel) and config.sinusoidal_embeddings:
+            create_sinusoidal_embeddings(
+                config.max_position_embeddings, config.emb_dim, out=module.position_embeddings.weight
+            )
+
+
+@auto_docstring
+class FlaubertModel(FlaubertPreTrainedModel):
+    def __init__(self, config):  # , dico, is_encoder, with_output):
+        """
+        Build the embeddings and the per-layer attention / LayerNorm / feed-forward module lists.
+
+        Args:
+            config: Model configuration. Attributes read here include `is_encoder`, `causal`, `n_langs`,
+                `use_lang_emb`, `n_words`, `eos_index`, `pad_index`, `emb_dim`, `n_heads`, `n_layers`, `dropout`,
+                `attention_dropout`, `max_position_embeddings` and `layer_norm_eps`; `pruned_heads`, `layerdrop`
+                and `pre_norm` are optional.
+
+        Raises:
+            NotImplementedError: If `config.is_encoder` is false (decoder mode is not supported).
+            AssertionError: If `config.emb_dim` is not a multiple of `config.n_heads`.
+        """
+        super().__init__(config)
+
+        # encoder / decoder, output layer
+        self.is_encoder = config.is_encoder
+        self.is_decoder = not config.is_encoder
+        if self.is_decoder:
+            raise NotImplementedError("Currently Flaubert can only be used as an encoder")
+        # self.with_output = with_output
+        self.causal = config.causal
+
+        # dictionary / languages
+        self.n_langs = config.n_langs
+        self.use_lang_emb = config.use_lang_emb
+        self.n_words = config.n_words
+        self.eos_index = config.eos_index
+        self.pad_index = config.pad_index
+        # self.dico = dico
+        # self.id2lang = config.id2lang
+        # self.lang2id = config.lang2id
+        # assert len(self.dico) == self.n_words
+        # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
+
+        # model parameters
+        self.dim = config.emb_dim  # 512 by default
+        self.hidden_dim = self.dim * 4  # 2048 by default
+        self.n_heads = config.n_heads  # 8 by default
+        self.n_layers = config.n_layers
+        self.dropout = config.dropout
+        self.attention_dropout = config.attention_dropout
+        assert self.dim % self.n_heads == 0, "transformer dim must be a multiple of n_heads"
+
+        # embeddings
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
+        # The language embedding table only exists for multilingual configs that enable it.
+        if config.n_langs > 1 and config.use_lang_emb:
+            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
+        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
+        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
+
+        # transformer layers
+        # Four parallel lists, all indexed by layer number.
+        self.attentions = nn.ModuleList()
+        self.layer_norm1 = nn.ModuleList()
+        self.ffns = nn.ModuleList()
+        self.layer_norm2 = nn.ModuleList()
+        # if self.is_decoder:
+        #     self.layer_norm15 = nn.ModuleList()
+        #     self.encoder_attn = nn.ModuleList()
+
+        for i in range(self.n_layers):
+            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config, layer_idx=i))
+            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+            # if self.is_decoder:
+            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+            #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
+            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+
+        # Re-apply head pruning recorded in the config: the recorded mapping is copied, the config entry is
+        # reset to an empty dict, and only layers whose attention still has `config.n_heads` heads are pruned.
+        if hasattr(config, "pruned_heads"):
+            pruned_heads = config.pruned_heads.copy().items()
+            config.pruned_heads = {}
+            for layer, heads in pruned_heads:
+                if self.attentions[int(layer)].n_heads == config.n_heads:
+                    self.prune_heads({int(layer): list(map(int, heads))})
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        # These are set after `post_init()`; both fall back to a default when the config lacks the attribute.
+        self.layerdrop = getattr(config, "layerdrop", 0.0)
+        self.pre_norm = getattr(config, "pre_norm", False)
+        # Non-persistent buffer of shape (1, max_position_embeddings) holding 0..max_position_embeddings-1;
+        # `forward` slices it when no `position_ids` are passed.
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    # Copied from transformers.models.xlm.modeling_xlm.XLMModel.get_input_embeddings
+    def get_input_embeddings(self):
+        """Return the token embedding module stored in `self.embeddings`."""
+        return self.embeddings
+
+    # Copied from transformers.models.xlm.modeling_xlm.XLMModel.set_input_embeddings
+    def set_input_embeddings(self, new_embeddings):
+        """Replace the token embedding module (`self.embeddings`) with `new_embeddings`."""
+        self.embeddings = new_embeddings
+
+    # Copied from transformers.models.xlm.modeling_xlm.XLMModel._prune_heads
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.attentions[layer].prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        lengths: Optional[torch.LongTensor] = None,
+        cache: Optional[dict[str, torch.FloatTensor]] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`:
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the
+            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
+            decoding. The dictionary object will be modified in-place during the forward pass to add newly computed
+            hidden-states.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # removed: src_enc=None, src_len=None
+        if input_ids is not None:
+            bs, slen = input_ids.size()
+        else:
+            bs, slen = inputs_embeds.size()[:-1]
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if cache is None:
+            cache = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        if isinstance(cache, tuple):
+            cache = EncoderDecoderCache.from_legacy_cache(cache)
+
+        if lengths is None:
+            if input_ids is not None:
+                lengths = (input_ids != self.pad_index).sum(dim=1).long()
+            else:
+                lengths = torch.tensor([slen] * bs, device=device)
+        # mask = input_ids != self.pad_index
+
+        # check inputs
+        assert lengths.size(0) == bs
+        assert lengths.max().item() <= slen
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # Setting the position-ids to the registered buffer in constructor, it helps
+        # when tracing the model without passing position-ids, solves
+        # issues similar to issue #5664
+        if position_ids is None:
+            if hasattr(self, "position_ids"):
+                position_ids = self.position_ids[:, :slen]
+                position_ids = position_ids.expand((bs, slen))
+            else:
+                position_ids = torch.arange(slen, dtype=torch.long, device=device)
+                position_ids = position_ids.unsqueeze(0).expand((bs, slen))
+        else:
+            assert position_ids.size() == (bs, slen)  # (slen, bs)
+            # position_ids = position_ids.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            assert langs.size() == (bs, slen)  # (slen, bs)
+            # langs = langs.transpose(0, 1)
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.n_layers)
+
+        # do not recompute cached elements
+        if cache is not None and input_ids is not None:
+            _slen = slen - cache.get_seq_length()
+            input_ids = input_ids[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
+        if langs is not None and self.use_lang_emb and self.config.n_langs > 1:
+            tensor = tensor + self.lang_embeddings(langs)
+        if token_type_ids is not None:
+            tensor = tensor + self.embeddings(token_type_ids)
+        tensor = self.layer_norm_emb(tensor)
+        tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training)
+        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # transformer layers
+        hidden_states = () if output_hidden_states else None
+        attentions = () if output_attentions else None
+        for i in range(self.n_layers):
+            # LayerDrop
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            if output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            if not self.pre_norm:
+                attn_outputs = self.attentions[i](
+                    tensor,
+                    attn_mask,
+                    cache=cache,
+                    head_mask=head_mask[i],
+                    output_attentions=output_attentions,
+                    cache_position=cache_position,
+                )
+                attn = attn_outputs[0]
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+                attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
+                tensor = tensor + attn
+                tensor = self.layer_norm1[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm1[i](tensor)
+                attn_outputs = self.attentions[i](tensor_normalized, attn_mask, cache=cache, head_mask=head_mask[i])
+                attn = attn_outputs[0]
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+                attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
+                tensor = tensor + attn
+
+            # FFN
+            if not self.pre_norm:
+                tensor = tensor + self.ffns[i](tensor)
+                tensor = self.layer_norm2[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm2[i](tensor)
+                tensor = tensor + self.ffns[i](tensor_normalized)
+
+            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # Add last hidden state
+        if output_hidden_states:
+            hidden_states = hidden_states + (tensor,)
+
+        if not return_dict:
+            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
+
+        return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
+
+
+@auto_docstring(
+    custom_intro="""
+    The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["pred_layer.proj.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = FlaubertModel(config)
+        self.pred_layer = FlaubertPredLayer(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.pred_layer.proj
+
+    def set_output_embeddings(self, new_embeddings):
+        self.pred_layer.proj = new_embeddings
+
+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        # Overwritten -- uses a language id
+
+        mask_token_id = self.config.mask_token_id
+        lang_id = self.config.lang_id
+
+        effective_batch_size = input_ids.shape[0]
+        mask_token = torch.full((effective_batch_size, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
+        input_ids = torch.cat([input_ids, mask_token], dim=1)
+        if lang_id is not None:
+            langs = torch.full_like(input_ids, lang_id)
+        else:
+            langs = None
+        return {"input_ids": input_ids, "langs": langs}
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use `attention_mask` for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`:
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Dictionary strings to `torch.FloatTensor` that contains precomputed hidden-states (key and values in the
+            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
+            decoding. The dictionary object will be modified in-place during the forward pass to add newly computed
+            hidden-states.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        output = transformer_outputs[0]
+        outputs = self.pred_layer(output, labels)  # (loss, logits) or (logits,) depending on if labels are provided.
+
+        if not return_dict:
+            return outputs + transformer_outputs[1:]
+
+        return MaskedLMOutput(
+            loss=outputs[0] if labels is not None else None,
+            logits=outputs[0] if labels is None else outputs[1],
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
+    e.g. for GLUE tasks.
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForSequenceClassification(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.transformer = FlaubertModel(config)
+        self.sequence_summary = FlaubertSequenceSummary(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential
+            decoding.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        output = transformer_outputs[0]
+        logits = self.sequence_summary(output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForTokenClassification(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = FlaubertModel(config)
+        self.dropout = nn.Dropout(config.dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential
+            decoding.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        """
+        Create the bare transformer and the linear layer that produces the span logits.
+
+        Args:
+            config: Model configuration; `hidden_size` and `num_labels` size the `qa_outputs` layer.
+        """
+        super().__init__(config)
+
+        self.transformer = FlaubertModel(config)
+        # `forward` splits this layer's output along the last dimension into start and end logits.
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential
+            decoding.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = transformer_outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + transformer_outputs[1:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of question answering models using a `SquadHead`.
+    """
+)
+# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput with XLM->Flaubert
+class FlaubertForQuestionAnsweringOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
+        Classification loss as the sum of start token, end token (and is_impossible if provided) classification
+        losses.
+    start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Log probabilities for the top config.start_n_top start token possibilities (beam-search).
+    start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices for the top config.start_n_top start token possibilities (beam-search).
+    end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
+        (beam-search).
+    end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
+    cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+        Log probabilities for the `is_impossible` label of the answers.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    start_top_log_probs: Optional[torch.FloatTensor] = None
+    start_top_index: Optional[torch.LongTensor] = None
+    end_top_log_probs: Optional[torch.FloatTensor] = None
+    end_top_index: Optional[torch.LongTensor] = None
+    cls_logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMForQuestionAnswering with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForQuestionAnswering(FlaubertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.transformer = FlaubertModel(config)
+        self.qa_outputs = FlaubertSQuADHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        is_impossible: Optional[torch.Tensor] = None,
+        cls_index: Optional[torch.Tensor] = None,
+        p_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FlaubertForQuestionAnsweringOutput]:
+        r"""
+        langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential
+            decoding.
+        is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels whether a question has an answer or no answer (SQuAD 2.0)
+        cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the classification token to use as input for computing plausibility of the
+            answer.
+        p_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). 1.0 means token should be
+            masked. 0.0 mean token is not masked.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FlaubertForQuestionAnswering
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-mlm-en-2048")
+        >>> model = FlaubertForQuestionAnswering.from_pretrained("FacebookAI/xlm-mlm-en-2048")
+
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+        ...     0
+        ... )  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss = outputs.loss
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        output = transformer_outputs[0]
+
+        outputs = self.qa_outputs(
+            output,
+            start_positions=start_positions,
+            end_positions=end_positions,
+            cls_index=cls_index,
+            is_impossible=is_impossible,
+            p_mask=p_mask,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return outputs + transformer_outputs[1:]
+
+        return FlaubertForQuestionAnsweringOutput(
+            loss=outputs.loss,
+            start_top_log_probs=outputs.start_top_log_probs,
+            start_top_index=outputs.start_top_index,
+            end_top_log_probs=outputs.end_top_log_probs,
+            end_top_index=outputs.end_top_index,
+            cls_logits=outputs.cls_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.xlm.modeling_xlm.XLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class FlaubertForMultipleChoice(FlaubertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.transformer = FlaubertModel(config)
+        self.sequence_summary = FlaubertSequenceSummary(config)
+        self.logits_proj = nn.Linear(config.num_labels, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        langs: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        lengths: Optional[torch.Tensor] = None,
+        cache: Optional[dict[str, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MultipleChoiceModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        langs (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        lengths (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, torch.FloatTensor]`, *optional*):
+            Instance of `EncoderDecoderCache` that contains precomputed KV states. Can be used to speed up sequential
+            decoding.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        langs = langs.view(-1, langs.size(-1)) if langs is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        if lengths is not None:
+            logger.warning(
+                "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
+                "attention mask instead."
+            )
+            lengths = None
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        output = transformer_outputs[0]
+        logits = self.sequence_summary(output)
+        logits = self.logits_proj(logits)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = [  # names exported by a star-import of this module
+    "FlaubertForMultipleChoice",
+    "FlaubertForQuestionAnswering",
+    "FlaubertForQuestionAnsweringSimple",
+    "FlaubertForSequenceClassification",
+    "FlaubertForTokenClassification",
+    "FlaubertModel",
+    "FlaubertWithLMHeadModel",
+    "FlaubertPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..88b7ae9f0c9ddd96e761b3739a035c55a217dec8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/modeling_tf_flaubert.py
@@ -0,0 +1,1343 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+TF 2.0 Flaubert model.
+"""
+
+from __future__ import annotations
+
+import itertools
+import random
+import warnings
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFMultipleChoiceLoss,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFSequenceSummary,
+    TFSharedEmbeddings,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    MULTIPLE_CHOICE_DUMMY_INPUTS,
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_flaubert import FlaubertConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "flaubert/flaubert_base_cased"
+_CONFIG_FOR_DOC = "FlaubertConfig"
+
+
+FLAUBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
+    heads, etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage
+    and behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`FlaubertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+FLAUBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - `1` for tokens that are **not masked**,
+            - `0` for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        langs (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are
+            languages ids which can be obtained from the language names by using two conversion mappings provided in
+            the configuration of the model (only provided for multilingual models). More precisely, the *language name
+            to language id* mapping is in `model.config.lang2id` (which is a dictionary string to int) and the
+            *language id to language name* mapping is in `model.config.id2lang` (dictionary int to string).
+
+            See usage examples detailed in the [multilingual documentation](../multilingual).
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - `0` corresponds to a *sentence A* token,
+            - `1` corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        lengths (`tf.Tensor` or `Numpy array` of shape `(batch_size,)`, *optional*):
+            Length of each sentence that can be used to avoid performing attention on padding token indices. You can
+            also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in
+            `[0, ..., input_ids.size(-1)]`.
+        cache (`dict[str, tf.Tensor]`, *optional*):
+            Dictionary string to `tf.FloatTensor` that contains precomputed hidden states (key and values in the
+            attention blocks) as computed by the model (see `cache` output below). Can be used to speed up sequential
+            decoding.
+
+            The dictionary object will be modified in-place during the forward pass to add newly computed
+            hidden-states.
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - `1` indicates the head is **not masked**,
+            - `0` indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+def get_masks(slen, lengths, causal, padding_mask=None):
+    """
+    Generate hidden states mask, and optionally an attention mask.
+    """
+    bs = shape_list(lengths)[0]
+    if padding_mask is not None:
+        mask = padding_mask
+    else:
+        # assert lengths.max().item() <= slen
+        alen = tf.range(slen, dtype=lengths.dtype)
+        mask = alen < tf.expand_dims(lengths, axis=1)
+
+    # attention mask is the same as mask, or triangular inferior attention (causal)
+    if causal:
+        attn_mask = tf.less_equal(
+            tf.tile(tf.reshape(alen, (1, 1, slen)), (bs, slen, 1)), tf.reshape(alen, (1, slen, 1))
+        )
+    else:
+        attn_mask = mask
+
+    # sanity check
+    # assert shape_list(mask) == [bs, slen]
+    tf.debugging.assert_equal(shape_list(mask), [bs, slen])
+    if causal:
+        tf.debugging.assert_equal(shape_list(attn_mask), [bs, slen, slen])
+
+    return mask, attn_mask
+
+
+class TFFlaubertPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = FlaubertConfig
+    base_model_prefix = "transformer"
+
+    @property
+    def dummy_inputs(self):
+        # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed
+        inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]], dtype=tf.int32)
+        attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32)
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            return {
+                "input_ids": inputs_list,
+                "attention_mask": attns_list,
+                "langs": tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]], dtype=tf.int32),
+            }
+        else:
+            return {"input_ids": inputs_list, "attention_mask": attns_list}
+
+
+@add_start_docstrings(
+    "The bare Flaubert Model transformer outputting raw hidden-states without any specific head on top.",
+    FLAUBERT_START_DOCSTRING,
+)
+class TFFlaubertModel(TFFlaubertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutput:
+        outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+
+
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMMultiHeadAttention with XLM->Flaubert
+class TFFlaubertMultiHeadAttention(keras.layers.Layer):
+    NEW_ID = itertools.count()
+
+    def __init__(self, n_heads, dim, config, **kwargs):
+        super().__init__(**kwargs)
+        self.layer_id = next(TFFlaubertMultiHeadAttention.NEW_ID)
+        self.dim = dim
+        self.n_heads = n_heads
+        self.output_attentions = config.output_attentions
+        assert self.dim % self.n_heads == 0
+
+        self.q_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="q_lin")
+        self.k_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="k_lin")
+        self.v_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="v_lin")
+        self.out_lin = keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name="out_lin")
+        self.dropout = keras.layers.Dropout(config.attention_dropout)
+        self.pruned_heads = set()
+        self.dim = dim
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(self, input, mask, kv, cache, head_mask, output_attentions, training=False):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = shape_list(input)
+
+        if kv is None:
+            klen = qlen if cache is None else cache["slen"] + qlen
+        else:
+            klen = shape_list(kv)[1]
+
+        # assert dim == self.dim, f'Dimensions do not match: {dim} input vs {self.dim} configured'
+        dim_per_head = self.dim // self.n_heads
+        mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
+
+        def shape(x):
+            """projection"""
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
+
+        def unshape(x):
+            """compute context"""
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
+
+        q = shape(self.q_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+
+        if kv is None:
+            k = shape(self.k_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(input))  # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k_lin(k))  # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(v))  # (bs, n_heads, qlen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = tf.concat([k_, k], axis=2)  # (bs, n_heads, klen, dim_per_head)
+                    v = tf.concat([v_, v], axis=2)  # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+
+            cache[self.layer_id] = (k, v)
+
+        f_dim_per_head = tf.cast(dim_per_head, dtype=q.dtype)
+        q = tf.multiply(q, tf.math.rsqrt(f_dim_per_head))  # (bs, n_heads, qlen, dim_per_head)
+        k = tf.cast(k, dtype=q.dtype)
+        scores = tf.matmul(q, k, transpose_b=True)  # (bs, n_heads, qlen, klen)
+        mask = tf.reshape(mask, mask_reshape)  # (bs, n_heads, qlen, klen)
+        # scores.masked_fill_(mask, -float('inf'))                            # (bs, n_heads, qlen, klen)
+        mask = tf.cast(mask, dtype=scores.dtype)
+        scores = scores - 1e30 * (1.0 - mask)
+        weights = stable_softmax(scores, axis=-1)  # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = tf.matmul(weights, v)  # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)  # (bs, qlen, dim)
+        outputs = (self.out_lin(context),)
+
+        if output_attentions:
+            outputs = outputs + (weights,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "q_lin", None) is not None:
+            with tf.name_scope(self.q_lin.name):
+                self.q_lin.build([None, None, self.dim])
+        if getattr(self, "k_lin", None) is not None:
+            with tf.name_scope(self.k_lin.name):
+                self.k_lin.build([None, None, self.dim])
+        if getattr(self, "v_lin", None) is not None:
+            with tf.name_scope(self.v_lin.name):
+                self.v_lin.build([None, None, self.dim])
+        if getattr(self, "out_lin", None) is not None:
+            with tf.name_scope(self.out_lin.name):
+                self.out_lin.build([None, None, self.dim])
+
+
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMTransformerFFN
+class TFFlaubertTransformerFFN(keras.layers.Layer):
+    def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.lin1 = keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name="lin1")
+        self.lin2 = keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name="lin2")
+        self.act = get_tf_activation("gelu") if config.gelu_activation else get_tf_activation("relu")
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.in_dim = in_dim
+        self.dim_hidden = dim_hidden
+
+    def call(self, input, training=False):
+        x = self.lin1(input)
+        x = self.act(x)
+        x = self.lin2(x)
+        x = self.dropout(x, training=training)
+
+        return x
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "lin1", None) is not None:
+            with tf.name_scope(self.lin1.name):
+                self.lin1.build([None, None, self.in_dim])
+        if getattr(self, "lin2", None) is not None:
+            with tf.name_scope(self.lin2.name):
+                self.lin2.build([None, None, self.dim_hidden])
+
+
+@keras_serializable
+class TFFlaubertMainLayer(keras.layers.Layer):
+    config_class = FlaubertConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.n_heads = config.n_heads
+        self.n_langs = config.n_langs
+        self.dim = config.emb_dim
+        self.hidden_dim = self.dim * 4
+        self.n_words = config.n_words
+        self.pad_index = config.pad_index
+        self.causal = config.causal
+        self.n_layers = config.n_layers
+        self.use_lang_emb = config.use_lang_emb
+        self.layerdrop = getattr(config, "layerdrop", 0.0)
+        self.pre_norm = getattr(config, "pre_norm", False)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict
+        self.max_position_embeddings = config.max_position_embeddings
+        self.embed_init_std = config.embed_init_std
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.embeddings = TFSharedEmbeddings(
+            self.n_words, self.dim, initializer_range=config.embed_init_std, name="embeddings"
+        )
+        self.layer_norm_emb = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm_emb")
+        self.attentions = []
+        self.layer_norm1 = []
+        self.ffns = []
+        self.layer_norm2 = []
+
+        for i in range(self.n_layers):
+            self.attentions.append(
+                TFFlaubertMultiHeadAttention(self.n_heads, self.dim, config=config, name=f"attentions_._{i}")
+            )
+            self.layer_norm1.append(
+                keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm1_._{i}")
+            )
+            # if self.is_decoder:
+            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+            #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.ffns.append(
+                TFFlaubertTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name=f"ffns_._{i}")
+            )
+            self.layer_norm2.append(
+                keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name=f"layer_norm2_._{i}")
+            )
+
+    def build(self, input_shape=None):
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.dim],
+                initializer=get_initializer(self.embed_init_std),
+            )
+
+        if self.n_langs > 1 and self.use_lang_emb:
+            with tf.name_scope("lang_embeddings"):
+                self.lang_embeddings = self.add_weight(
+                    name="embeddings",
+                    shape=[self.n_langs, self.dim],
+                    initializer=get_initializer(self.embed_init_std),
+                )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "layer_norm_emb", None) is not None:
+            with tf.name_scope(self.layer_norm_emb.name):
+                self.layer_norm_emb.build([None, None, self.dim])
+        for layer in self.attentions:
+            with tf.name_scope(layer.name):
+                layer.build(None)
+        for layer in self.layer_norm1:
+            with tf.name_scope(layer.name):
+                layer.build([None, None, self.dim])
+        for layer in self.ffns:
+            with tf.name_scope(layer.name):
+                layer.build(None)
+        for layer in self.layer_norm2:
+            with tf.name_scope(layer.name):
+                layer.build([None, None, self.dim])
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutput:
+        # removed: src_enc=None, src_len=None
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            bs, slen = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            bs, slen = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if lengths is None:
+            if input_ids is not None:
+                lengths = tf.reduce_sum(
+                    tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=input_ids.dtype), axis=1
+                )
+            else:
+                lengths = tf.convert_to_tensor([slen] * bs)
+        # mask = input_ids != self.pad_index
+
+        # check inputs
+        # assert shape_list(lengths)[0] == bs
+        (
+            tf.debugging.assert_equal(shape_list(lengths)[0], bs),
+            f"Expected batch size {shape_list(lengths)[0]} and received batch size {bs} mismatched",
+        )
+        # assert lengths.max().item() <= slen
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # position_ids
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(slen), axis=0)
+            position_ids = tf.tile(position_ids, (bs, 1))
+
+        # assert shape_list(position_ids) == [bs, slen]  # (slen, bs)
+        (
+            tf.debugging.assert_equal(shape_list(position_ids), [bs, slen]),
+            f"Position id shape {shape_list(position_ids)} and input shape {[bs, slen]} mismatched",
+        )
+        # position_ids = position_ids.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            # assert shape_list(langs) == [bs, slen]  # (slen, bs)
+            (
+                tf.debugging.assert_equal(shape_list(langs), [bs, slen]),
+                f"Lang shape {shape_list(langs)} and input shape {[bs, slen]} mismatched",
+            )
+            # langs = langs.transpose(0, 1)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.n_layers
+
+        # do not recompute cached elements
+        if cache is not None and input_ids is not None:
+            _slen = slen - cache["slen"]
+            input_ids = input_ids[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.embeddings.vocab_size)
+            inputs_embeds = self.embeddings(input_ids)
+
+        tensor = inputs_embeds + tf.gather(self.position_embeddings, position_ids)
+
+        if langs is not None and self.use_lang_emb:
+            tensor = tensor + tf.gather(self.lang_embeddings, langs)
+        if token_type_ids is not None:
+            tensor = tensor + self.embeddings(token_type_ids)
+
+        tensor = self.layer_norm_emb(tensor)
+        tensor = self.dropout(tensor, training=training)
+        mask = tf.cast(mask, dtype=tensor.dtype)
+        tensor = tensor * tf.expand_dims(mask, axis=-1)
+
+        # hidden_states and attentions cannot be None in graph mode.
+        hidden_states = () if output_hidden_states else None
+        attentions = () if output_attentions else None
+
+        # transformer layers
+        for i in range(self.n_layers):
+            # LayerDrop
+            dropout_probability = random.uniform(0, 1)
+
+            if training and (dropout_probability < self.layerdrop):
+                continue
+
+            if output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            if not self.pre_norm:
+                attn_outputs = self.attentions[i](
+                    tensor,
+                    attn_mask,
+                    None,
+                    cache,
+                    head_mask[i],
+                    output_attentions,
+                    training=training,
+                )
+                attn = attn_outputs[0]
+
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+
+                attn = self.dropout(attn, training=training)
+                tensor = tensor + attn
+                tensor = self.layer_norm1[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm1[i](tensor)
+                attn_outputs = self.attentions[i](
+                    tensor_normalized,
+                    attn_mask,
+                    None,
+                    cache,
+                    head_mask[i],
+                    output_attentions,
+                    training=training,
+                )
+                attn = attn_outputs[0]
+
+                if output_attentions:
+                    attentions = attentions + (attn_outputs[1],)
+
+                attn = self.dropout(attn, training=training)
+                tensor = tensor + attn
+
+            # encoder attention (for decoder only)
+            # if self.is_decoder and src_enc is not None:
+            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
+            #     attn = nn.functional.dropout(attn, p=self.dropout, training=self.training)
+            #     tensor = tensor + attn
+            #     tensor = self.layer_norm15[i](tensor)
+
+            # FFN
+            if not self.pre_norm:
+                tensor = tensor + self.ffns[i](tensor)
+                tensor = self.layer_norm2[i](tensor)
+            else:
+                tensor_normalized = self.layer_norm2[i](tensor)
+                tensor = tensor + self.ffns[i](tensor_normalized)
+
+            tensor = tensor * tf.expand_dims(mask, axis=-1)
+
+        # Add last hidden state
+        if output_hidden_states:
+            hidden_states = hidden_states + (tensor,)
+
+        # update cache length
+        if cache is not None:
+            cache["slen"] += tensor.size(1)
+
+        # move back sequence length to dimension 0
+        # tensor = tensor.transpose(0, 1)
+
+        if not return_dict:
+            return tuple(v for v in [tensor, hidden_states, attentions] if v is not None)
+
+        return TFBaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions)
+
+
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMPredLayer
+class TFFlaubertPredLayer(keras.layers.Layer):
+    """
+    Prediction layer (cross_entropy or adaptive_softmax).
+    """
+
+    def __init__(self, config, input_embeddings, **kwargs):
+        super().__init__(**kwargs)
+
+        self.asm = config.asm
+        self.n_words = config.n_words
+        self.pad_index = config.pad_index
+
+        if config.asm is False:
+            self.input_embeddings = input_embeddings
+        else:
+            raise NotImplementedError
+            # self.proj = nn.AdaptiveLogSoftmaxWithLoss(
+            #     in_features=dim,
+            #     n_classes=config.n_words,
+            #     cutoffs=config.asm_cutoffs,
+            #     div_value=config.asm_div_value,
+            #     head_bias=True,  # default is False
+            # )
+
+    def build(self, input_shape):
+        # The output weights are the same as the input embeddings, but there is an output-only bias for each token.
+        self.bias = self.add_weight(shape=(self.n_words,), initializer="zeros", trainable=True, name="bias")
+
+        super().build(input_shape)
+
+    def get_output_embeddings(self):
+        return self.input_embeddings
+
+    def set_output_embeddings(self, value):
+        self.input_embeddings.weight = value
+        self.input_embeddings.vocab_size = shape_list(value)[0]
+
+    def get_bias(self):
+        return {"bias": self.bias}
+
+    def set_bias(self, value):
+        self.bias = value["bias"]
+        self.vocab_size = shape_list(value["bias"])[0]
+
+    def call(self, hidden_states):
+        hidden_states = self.input_embeddings(hidden_states, mode="linear")
+        hidden_states = hidden_states + self.bias
+
+        return hidden_states
+
+
+@dataclass
+class TFFlaubertWithLMHeadModelOutput(ModelOutput):
+    """
+    Base class for [`TFFlaubertWithLMHeadModel`] outputs.
+
+    Args:
+        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    # Every field defaults to None, so an instance can be created with any subset of them.
+    # NOTE(review): the docstring above is left untouched because the doc-generation decorators used in this
+    # file may parse its `Args:` section -- confirm before rewording it.
+    logits: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+@add_start_docstrings(
+    """
+    The Flaubert Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+class TFFlaubertWithLMHeadModel(TFFlaubertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.pred_layer = TFFlaubertPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj")
+        # Flaubert does not have past caching features
+        self.supports_xla_generation = False
+
+    def get_lm_head(self):
+        return self.pred_layer
+
+    def get_prefix_bias_name(self):
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.pred_layer.name
+
+    def prepare_inputs_for_generation(self, inputs, **kwargs):
+        mask_token_id = self.config.mask_token_id
+        lang_id = self.config.lang_id
+
+        effective_batch_size = inputs.shape[0]
+        mask_token = tf.fill((effective_batch_size, 1), 1) * mask_token_id
+        inputs = tf.concat([inputs, mask_token], axis=1)
+
+        if lang_id is not None:
+            langs = tf.ones_like(inputs) * lang_id
+        else:
+            langs = None
+        return {"input_ids": inputs, "langs": langs}
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFFlaubertWithLMHeadModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: np.ndarray | tf.Tensor | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFFlaubertWithLMHeadModelOutput:
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        output = transformer_outputs[0]
+        outputs = self.pred_layer(output)
+
+        if not return_dict:
+            return (outputs,) + transformer_outputs[1:]
+
+        return TFFlaubertWithLMHeadModelOutput(
+            logits=outputs, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "pred_layer", None) is not None:
+            with tf.name_scope(self.pred_layer.name):
+                self.pred_layer.build(None)
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
+    e.g. for GLUE tasks.
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForSequenceClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForSequenceClassification(TFFlaubertPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        output = transformer_outputs[0]
+
+        logits = self.sequence_summary(output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "sequence_summary", None) is not None:
+            with tf.name_scope(self.sequence_summary.name):
+                self.sequence_summary.build(None)
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForQuestionAnsweringSimple with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForQuestionAnsweringSimple(TFFlaubertPreTrainedModel, TFQuestionAnsweringLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.qa_outputs = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="qa_outputs"
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for the position (index) of the start of the labelled span, used to compute the span loss. A loss
+            is returned only when `end_positions` is given as well. Positions are clamped to the length of the
+            sequence (`sequence_length`); positions outside of the sequence are not taken into account.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for the position (index) of the end of the labelled span, used to compute the span loss. A loss is
+            returned only when `start_positions` is given as well. Positions are clamped to the length of the
+            sequence (`sequence_length`); positions outside of the sequence are not taken into account.
+        """
+        encoder_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = encoder_outputs[0]
+
+        # The head's last axis is split into two halves: the first gives start logits, the second end logits.
+        span_logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(span_logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        loss = None
+        if not (start_positions is None or end_positions is None):
+            span_labels = {"start_position": start_positions, "end_position": end_positions}
+            loss = self.hf_compute_loss(span_labels, (start_logits, end_logits))
+
+        if not return_dict:
+            flat_outputs = (start_logits, end_logits) + encoder_outputs[1:]
+            return flat_outputs if loss is None else (loss,) + flat_outputs
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        # Runs once: each sub-layer is built inside a name scope named after that layer.
+        if not self.built:
+            self.built = True
+            if (encoder := getattr(self, "transformer", None)) is not None:
+                with tf.name_scope(encoder.name):
+                    encoder.build(None)
+            if (qa_head := getattr(self, "qa_outputs", None)) is not None:
+                with tf.name_scope(qa_head.name):
+                    qa_head.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForTokenClassification with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForTokenClassification(TFFlaubertPreTrainedModel, TFTokenClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        # Encoder backbone, a dropout layer, and a dense head with `config.num_labels` outputs.
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.dropout = keras.layers.Dropout(config.dropout)
+        head_init = get_initializer(config.init_std)
+        self.classifier = keras.layers.Dense(config.num_labels, kernel_initializer=head_init, name="classifier")
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(FLAUBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        encoder_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            langs=langs,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden = encoder_outputs[0]
+        hidden = self.dropout(hidden, training=training)
+        logits = self.classifier(hidden)
+
+        # Without labels there is nothing to score, so no loss is produced.
+        loss = self.hf_compute_loss(labels, logits) if labels is not None else None
+
+        if not return_dict:
+            flat_outputs = (logits,) + encoder_outputs[1:]
+            return flat_outputs if loss is None else (loss,) + flat_outputs
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        # Runs once: each sub-layer is built inside a name scope named after that layer.
+        if not self.built:
+            self.built = True
+            if (encoder := getattr(self, "transformer", None)) is not None:
+                with tf.name_scope(encoder.name):
+                    encoder.build(None)
+            if (token_head := getattr(self, "classifier", None)) is not None:
+                with tf.name_scope(token_head.name):
+                    token_head.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    Flaubert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    FLAUBERT_START_DOCSTRING,
+)
+# Copied from transformers.models.xlm.modeling_tf_xlm.TFXLMForMultipleChoice with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert
+class TFFlaubertForMultipleChoice(TFFlaubertPreTrainedModel, TFMultipleChoiceLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        # Every other Flaubert head here is initialised from `config.init_std`; fall back to it when the config
+        # carries no `initializer_range` attribute instead of failing with AttributeError.
+        proj_std = getattr(config, "initializer_range", config.init_std)
+        self.transformer = TFFlaubertMainLayer(config, name="transformer")
+        self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary")
+        self.logits_proj = keras.layers.Dense(1, kernel_initializer=get_initializer(proj_std), name="logits_proj")
+        self.config = config
+
+    @property
+    def dummy_inputs(self):
+        """
+        Dummy inputs to build the network.
+
+        Returns:
+            `dict[str, tf.Tensor]`: always holds `"input_ids"`, plus `"langs"` when the config has `use_lang_emb`
+            set and `n_langs > 1`. Each tensor is built from `MULTIPLE_CHOICE_DUMMY_INPUTS` with dtype `tf.int32`.
+        """
+        dummies = {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32)}
+
+        # Sometimes Flaubert has language embeddings so don't forget to build them as well if needed: in that
+        # case a `langs` tensor built from the same constant is supplied alongside `input_ids`.
+        needs_langs = self.config.use_lang_emb and self.config.n_langs > 1
+        if needs_langs:
+            dummies["langs"] = tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS, dtype=tf.int32)
+
+        return dummies
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(
+        FLAUBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+    )
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        langs: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        lengths: np.ndarray | tf.Tensor | None = None,
+        cache: dict[str, tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]:
+        # Choice count and sequence length are read from `input_ids` when given, else from `inputs_embeds`.
+        source = input_ids if input_ids is not None else inputs_embeds
+        source_shape = shape_list(source)
+        num_choices, seq_length = source_shape[1], source_shape[2]
+
+        def _flatten(tensor):
+            # Reshape to `(-1, seq_length)`; inputs that were not supplied stay `None`.
+            return tf.reshape(tensor, (-1, seq_length)) if tensor is not None else None
+
+        flat_input_ids = _flatten(input_ids)
+        flat_attention_mask = _flatten(attention_mask)
+        flat_token_type_ids = _flatten(token_type_ids)
+        flat_position_ids = _flatten(position_ids)
+        flat_langs = _flatten(langs)
+        flat_inputs_embeds = None
+        if inputs_embeds is not None:
+            flat_inputs_embeds = tf.reshape(inputs_embeds, (-1, seq_length, shape_list(inputs_embeds)[3]))
+
+        if lengths is not None:
+            logger.warning(
+                "The `lengths` parameter cannot be used with the Flaubert multiple choice models. Please use the "
+                "attention mask instead.",
+            )
+            lengths = None
+
+        encoder_outputs = self.transformer(
+            input_ids=flat_input_ids,
+            attention_mask=flat_attention_mask,
+            langs=flat_langs,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            lengths=lengths,
+            cache=cache,
+            head_mask=head_mask,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        summary = self.sequence_summary(encoder_outputs[0])
+        choice_scores = self.logits_proj(summary)
+        # Regroup the projected scores so that each row holds `num_choices` values.
+        reshaped_logits = tf.reshape(choice_scores, (-1, num_choices))
+
+        loss = self.hf_compute_loss(labels, reshaped_logits) if labels is not None else None
+
+        if not return_dict:
+            flat_outputs = (reshaped_logits,) + encoder_outputs[1:]
+            return flat_outputs if loss is None else (loss,) + flat_outputs
+
+        return TFMultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        # Runs once: each sub-layer is built inside a name scope named after that layer.
+        if not self.built:
+            self.built = True
+            if (encoder := getattr(self, "transformer", None)) is not None:
+                with tf.name_scope(encoder.name):
+                    encoder.build(None)
+            if (summary := getattr(self, "sequence_summary", None)) is not None:
+                with tf.name_scope(summary.name):
+                    summary.build(None)
+            if (projection := getattr(self, "logits_proj", None)) is not None:
+                with tf.name_scope(projection.name):
+                    projection.build([None, None, self.config.num_labels])
+
+
+__all__ = [
+    "TFFlaubertForMultipleChoice",
+    "TFFlaubertForQuestionAnsweringSimple",
+    "TFFlaubertForSequenceClassification",
+    "TFFlaubertForTokenClassification",
+    "TFFlaubertModel",
+    "TFFlaubertPreTrainedModel",
+    "TFFlaubertWithLMHeadModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..dee653450ebacc02bcfee8142c5d08b54358c1a9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flaubert/tokenization_flaubert.py
@@ -0,0 +1,538 @@
+# coding=utf-8
+# Copyright 2019-present CNRS, Facebook Inc. and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Flaubert."""
+
+import json
+import os
+import re
+import unicodedata
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+def convert_to_unicode(text):
+    """
+    Return `text` as a `str`: `bytes` are decoded as UTF-8 with undecodable bytes dropped (`errors="ignore"`), while
+    `str` input is returned unchanged.
+
+    Raises:
+        TypeError: if `text` is neither `bytes` nor `str`.
+    """
+    if isinstance(text, bytes):
+        return text.decode("utf-8", "ignore")
+    if not isinstance(text, str):
+        # Anything that is neither bytes nor str is rejected outright.
+        raise TypeError(f"not expecting type '{type(text)}'")
+    return text
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.get_pairs
+def get_pairs(word):
+    """
+    Collect every pair of neighbouring symbols in `word`, a non-empty sequence of symbols (each symbol being a
+    string of any length). Returns a `set` of `(left, right)` tuples; an empty `word` raises `IndexError`.
+    """
+    left = word[0]
+    neighbours = set()
+    for right in word[1:]:
+        neighbours.add((left, right))
+        left = right
+    return neighbours
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct
+def replace_unicode_punct(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
+
+    Rewrites full-width / CJK punctuation marks and full-width digits as ASCII, then turns the ideographic and
+    full-width full stops (together with any whitespace right after them) into `". "`.
+    """
+    ascii_forms = {
+        # commas
+        "，": ",",
+        "、": ",",
+        # quotation marks and title brackets
+        "”": '"', "“": '"',
+        "《": '"', "》": '"',
+        "」": '"', "「": '"',
+        "’": "'",
+        # colons, semicolon, question and exclamation marks
+        "∶": ":", "：": ":", "；": ";",
+        "？": "?", "！": "!",
+        # brackets
+        "（": "(",
+        "）": ")",
+        "〈": "<",
+        "〉": ">",
+        "【": "[",
+        "】": "]",
+        # full-width digits
+        "０": "0", "１": "1", "２": "2", "３": "3", "４": "4",
+        "５": "5", "６": "6", "７": "7", "８": "8", "９": "9",
+        # miscellaneous symbols
+        "～": "~", "…": "...",
+        "━": "-", "％": "%",
+    }
+    # Every key is one character and no replacement contains a key, so a single translate pass yields the same
+    # text as replacing the characters one after another.
+    text = text.translate(str.maketrans(ascii_forms))
+
+    # Full stops also swallow the whitespace that follows them.
+    text = re.sub(r"。\s*", ". ", text)
+    text = re.sub(r"．\s*", ". ", text)
+    return text
+
+
+# Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char
+def remove_non_printing_char(text):
+    """
+    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
+
+    Drops every character whose Unicode general category starts with "C" and returns the remaining characters
+    joined back into one string.
+    """
+    kept = (
+        char for char in text if not unicodedata.category(char).startswith("C")
+    )
+    return "".join(kept)
+
+
+class FlaubertTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Flaubert tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following:
+
+    - Moses preprocessing and tokenization.
+    - Normalizing all inputs text.
+    - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like
+      "__classify__") to a vocabulary.
+    - The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Vocabulary file.
+        merges_file (`str`):
+            Merges file.
+        do_lowercase (`bool`, *optional*, defaults to `False`):
+            Controls lower casing.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"</s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"<special1>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        additional_special_tokens (`List[str]`, *optional*, defaults to `['<special0>', '<special1>', '<special2>', '<special3>', '<special4>', '<special5>', '<special6>', '<special7>', '<special8>', '<special9>']`):
+            List of additional special tokens.
+        lang2id (`Dict[str, int]`, *optional*):
+            Dictionary mapping languages string identifiers to their IDs.
+        id2lang (`Dict[int, str]`, *optional*):
+            Dictionary mapping language IDs to their string identifiers.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        do_lowercase=False,
+        unk_token="<unk>",
+        bos_token="<s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        cls_token="</s>",
+        mask_token="<special1>",
+        additional_special_tokens=[
+            "<special0>",
+            "<special1>",
+            "<special2>",
+            "<special3>",
+            "<special4>",
+            "<special5>",
+            "<special6>",
+            "<special7>",
+            "<special8>",
+            "<special9>",
+        ],
+        lang2id=None,
+        id2lang=None,
+        **kwargs,
+    ):
+        do_lowercase_and_remove_accent = kwargs.pop("do_lowercase_and_remove_accent", None)
+        if do_lowercase_and_remove_accent is not None:
+            logger.warning(
+                "`do_lowercase_and_remove_accent` is passed as a keyword argument, but this won't do anything."
+                " `FlaubertTokenizer` will always set it to `False`."
+            )
+        # always `False`
+        self.do_lowercase_and_remove_accent = False
+        self.do_lowercase = do_lowercase
+        # Never hand the list literal that serves as the shared default argument to the base class: a later in-place
+        # change to one tokenizer's special tokens would otherwise leak into every tokenizer built with the default.
+        if additional_special_tokens is not None:
+            additional_special_tokens = list(additional_special_tokens)
+
+        try:
+            import sacremoses
+        except ImportError:
+            raise ImportError(
+                "You need to install sacremoses to use FlaubertTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            )
+        self.sm = sacremoses
+
+        # cache of sm.MosesPunctNormalizer instance
+        self.cache_moses_punct_normalizer = {}
+        # cache of sm.MosesTokenizer instance
+        self.cache_moses_tokenizer = {}
+        self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
+        self.lang2id = lang2id
+        self.id2lang = id2lang
+        if lang2id is not None and id2lang is not None:
+            assert len(lang2id) == len(id2lang)
+        self.ja_word_tokenizer = None
+        self.zh_word_tokenizer = None
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            merges = [tuple(line.split()[:2]) for line in merges_handle.read().split("\n")[:-1]]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+        super().__init__(
+            do_lowercase=do_lowercase,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            additional_special_tokens=additional_special_tokens,
+            lang2id=lang2id,
+            id2lang=id2lang,
+            **kwargs,
+        )
+
+    @property
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case
+    def do_lower_case(self):
+        return self.do_lowercase_and_remove_accent  # `__init__` always sets this attribute to False
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm
+    def moses_punct_norm(self, text, lang):
+        # One `MosesPunctNormalizer` is created per language code on first use and kept for later calls.
+        normalizers = self.cache_moses_punct_normalizer
+        if lang not in normalizers:
+            normalizers[lang] = self.sm.MosesPunctNormalizer(lang=lang)
+        punct_normalizer = normalizers[lang]
+        return punct_normalizer.normalize(text)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize
+    def moses_tokenize(self, text, lang):
+        # One `MosesTokenizer` is created per language code on first use and kept for later calls.
+        tokenizers = self.cache_moses_tokenizer
+        if lang not in tokenizers:
+            tokenizers[lang] = self.sm.MosesTokenizer(lang=lang)
+        moses_tokenizer = tokenizers[lang]
+        return moses_tokenizer.tokenize(text, return_str=False, escape=False)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline
+    def moses_pipeline(self, text, lang):
+        # Three passes, in order: unicode punctuation, Moses punctuation normalisation, non-printing removal.
+        unified = replace_unicode_punct(text)
+        normalized = self.moses_punct_norm(unified, lang)
+        return remove_non_printing_char(normalized)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize
+    def ja_tokenize(self, text):
+        if self.ja_word_tokenizer is None:  # built on first use, then reused
+            try:
+                import Mykytea
+                kytea_args = f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
+                self.ja_word_tokenizer = Mykytea.Mykytea(kytea_args)
+            except (AttributeError, ImportError):
+                install_help = (
+                    "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
+                    " (https://github.com/chezou/Mykytea-python) with the following steps",
+                    "1. git clone git@github.com:neubig/kytea.git && cd kytea",
+                    "2. autoreconf -i",
+                    "3. ./configure --prefix=$HOME/local",
+                    "4. make && make install",
+                    "5. pip install kytea",
+                )
+                for message in install_help:
+                    logger.error(message)
+                raise
+        return list(self.ja_word_tokenizer.getWS(text))
+
+    @property
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size
+    def vocab_size(self):
+        return len(self.encoder)  # number of entries loaded from the vocab file
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)  # added tokens win on duplicate keys
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe
+    def bpe(self, token):
+        # Consult the memo before doing any work, so repeated tokens skip the symbol-tuple construction.
+        if token in self.cache:
+            return self.cache[token]
+        # One symbol per character, with the end-of-word marker `</w>` glued onto the last one.
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
+        pairs = get_pairs(word)
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            # Pick the adjacent pair with the lowest merge rank; stop once no remaining pair has a rank.
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            word = tuple(new_word)
+            if len(word) == 1:
+                break
+            pairs = get_pairs(word)
+        word = " ".join(word)
+        if word == "\n  </w>":
+            word = "\n</w>"
+        self.cache[token] = word
+        return word
+
+    def preprocess_text(self, text):
+        # Decode first: `bytes.replace` rejects `str` arguments, so the quote rewriting must run on a `str`.
+        text = convert_to_unicode(text)
+        # Doubled backticks and doubled apostrophes become a plain double quote before NFC normalisation.
+        text = text.replace("``", '"').replace("''", '"')
+        text = unicodedata.normalize("NFC", text)
+        if self.do_lowercase:
+            text = text.lower()
+        return text
+
+    def _tokenize(self, text, bypass_tokenizer=False):
+        """
+        Split `text` into BPE sub-word tokens, always using the French (`"fr"`) Moses rules.
+
+        Details of tokenization:
+
+            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
+            - Install with `pip install sacremoses`
+
+        Args:
+            text (`str`): text to tokenize.
+            bypass_tokenizer (`bool`, defaults to `False`): when `True`, `text` is only split on whitespace and
+                BPE is applied, so callers can preprocess and tokenize the sentences externally.
+
+        Returns:
+            List of tokens.
+        """
+        lang = "fr"
+        if lang and self.lang2id and lang not in self.lang2id:
+            logger.error(
+                "Supplied language code not found in lang2id mapping. Please check that your language is supported by"
+                " the loaded pretrained model."
+            )
+
+        if not bypass_tokenizer:
+            normalized = self.moses_pipeline(self.preprocess_text(text), lang=lang)
+            words = self.moses_tokenize(normalized, lang=lang)
+        else:
+            words = text.split()
+
+        # Empty words are skipped; every other word contributes one token per space-separated BPE symbol.
+        split_tokens = []
+        for word in filter(None, words):
+            split_tokens += self.bpe(word).split(" ")
+
+        return split_tokens
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab, falling back to the id of `unk_token`."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab, falling back to `unk_token`."""
+        return self.decoder.get(index, self.unk_token)
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string: tokens are concatenated and every `</w>`
+        end-of-word marker becomes a space; leading and trailing whitespace is stripped."""
+        return "".join(tokens).replace("</w>", " ").strip()
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A Flaubert sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+
+        """
+        prefix, separator = [self.bos_token_id], [self.sep_token_id]
+        ids = prefix + token_ids_0 + separator
+        if token_ids_1 is not None:
+            # A second sequence follows the first separator and is closed by one more separator.
+            ids = ids + token_ids_1 + separator
+        return ids
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        # Layout mirrors `<s> A </s>` or `<s> A </s> B </s>`: only the added markers are flagged with 1.
+        mask = [1] + [0] * len(token_ids_0) + [1]
+        if token_ids_1 is not None:
+            mask += [0] * len(token_ids_1) + [1]
+        return mask
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        # Writes the vocabulary as JSON and the merges as text, then returns both paths. When `save_directory`
+        # is not an existing directory an error is logged and None is returned.
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+
+        prefix = filename_prefix + "-" if filename_prefix else ""
+        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        # Merges are written in rank order; a gap in the ranks is reported but does not stop the export.
+        expected_rank = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if token_index != expected_rank:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                writer.write(" ".join(bpe_tokens) + "\n")
+                expected_rank = token_index + 1
+
+        return vocab_file, merge_file
+
+    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__
+    def __getstate__(self):
+        # Shallow copy of the instance dict with `sm` (the imported sacremoses module) blanked out;
+        # `__setstate__` imports it again.
+        return {**self.__dict__, "sm": None}
+
+    # Adapted from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__ (error message names Flaubert)
+    def __setstate__(self, d):
+        # `__getstate__` stores `sm` as None, so the sacremoses module has to be imported again here.
+        self.__dict__ = d
+        try:
+            import sacremoses
+        except ImportError:
+            raise ImportError(
+                "You need to install sacremoses to use FlaubertTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            )
+
+        self.sm = sacremoses
+
+
+__all__ = ["FlaubertTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..292593cb4a201e35a9fd571baec639d9b940e76c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static-analysis branch: star-import every submodule so type checkers see its names
+    from .configuration_flava import *
+    from .feature_extraction_flava import *
+    from .image_processing_flava import *
+    from .image_processing_flava_fast import *
+    from .modeling_flava import *
+    from .processing_flava import *
+else:  # runtime branch: none of the submodules above are imported eagerly here
+    import sys
+    # Swap this package's `sys.modules` entry for a `_LazyModule` (presumably resolving names on first access).
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/configuration_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/configuration_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7bcb920e47acfacadd7a066e773ca2a3d42e2dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/configuration_flava.py
@@ -0,0 +1,697 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""FLAVA model configurations"""
+
+from typing import Any, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class FlavaImageConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FlavaImageModel`]. It is used to instantiate an
+    FLAVA model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        mask_token (`bool`, *optional*, defaults to `True`):
+            Whether to use a mask token or not. Used in MIM (Masked Image Modeling) loss for FLAVA.
+        vocab_size (`int`, *optional*, defaults to 8192):
+            Vocabulary size of the [`FlavaImageCodebook`] used in conjunction with [`FlavaImageModel`] for MIM (Masked
+            Image Modeling) loss for FLAVA.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaImageConfig, FlavaImageModel
+
+    >>> # Initializing a FlavaImageModel with a facebook/flava-full style configuration
+    >>> configuration = FlavaImageConfig()
+
+    >>> # Initializing a FlavaImageModel model (with random weights) from the facebook/flava-full style configuration
+    >>> model = FlavaImageModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "flava_image_model"
+    base_config_key = "image_config"
+
+    def __init__(
+        self,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.0,
+        attention_probs_dropout_prob: float = 0.0,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        image_size: int = 224,
+        patch_size: int = 16,
+        num_channels: int = 3,
+        qkv_bias: bool = True,
+        mask_token: bool = True,
+        vocab_size: int = 8192,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.mask_token = mask_token
+        self.vocab_size = vocab_size
+
+
+class FlavaTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FlavaTextModel`]. It is used to instantiate an
+    FLAVA model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`FlavaTextModel`].
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`FlavaTextModel`]. Note that even though
+            text encoder allows `token_type_ids`'s value as 2, for text-only pretraining and fine-tuning, only 1 is
+            used similar to RoBERTa.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048). For VL, max_length passed to model is 77.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Stored as `config.pad_token_id`; presumably the id of the padding token in the text vocabulary.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        kwargs (*optional*):
+            Additional keyword arguments, forwarded unchanged to `PretrainedConfig.__init__`. Options that are not
+            parameters of this class (for example the image-only `image_size`, `patch_size` and `num_channels`) are
+            not consumed here.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaTextConfig, FlavaTextModel
+
+    >>> # Initializing a FlavaTextModel with a facebook/flava-full style configuration
+    >>> configuration = FlavaTextConfig()
+
+    >>> # Initializing a FlavaTextModel model (with random weights) from the facebook/flava-full style configuration
+    >>> model = FlavaTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "flava_text_model"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size: int = 30522,
+        type_vocab_size: int = 2,
+        max_position_embeddings: int = 512,
+        position_embedding_type: str = "absolute",
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.0,
+        attention_probs_dropout_prob: float = 0.0,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        pad_token_id: int = 0,
+        qkv_bias: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.type_vocab_size = type_vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.position_embedding_type = position_embedding_type
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.qkv_bias = qkv_bias
+        self.pad_token_id = pad_token_id
+
+
+class FlavaMultimodalConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FlavaMultimodalModel`]. It is used to instantiate
+    an FLAVA model according to the specified arguments, defining the model architecture.
+
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        use_cls_token (`bool`, *optional*, defaults to `True`):
+            Whether to use an extra CLS token for multimodal settings. Usually needed by the FLAVA model.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaMultimodalConfig, FlavaMultimodalModel
+
+    >>> # Initializing a FlavaMultimodalModel with a facebook/flava-full style configuration
+    >>> configuration = FlavaMultimodalConfig()
+
+    >>> # Initializing a FlavaMultimodalModel model (with random weights) from the facebook/flava-full style configuration
+    >>> model = FlavaMultimodalModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "flava_multimodal_model"
+    base_config_key = "multimodal_config"
+
+    def __init__(
+        self,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 6,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.0,
+        attention_probs_dropout_prob: float = 0.0,
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        qkv_bias: bool = True,
+        use_cls_token: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.qkv_bias = qkv_bias
+        self.use_cls_token = use_cls_token
+
+
+class FlavaImageCodebookConfig(PretrainedConfig):
+    r"""
+    [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. It
+    is used to instantiate an FLAVA model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the FLAVA
+    [facebook/flava-image-codebook](https://huggingface.co/facebook/flava-image-codebook) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_groups (`int`, *optional*, defaults to 4):
+            Number of groups to be created. This parameter as of now doesn't affect the model and is used for some
+            internal calculation and estimations.
+        input_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the image to be passed.
+        num_blocks_per_group (`int`, *optional*, defaults to 2):
+            Number of conv-based blocks per group.
+        hidden_size (`int`, *optional*, defaults to 256):
+            Size of hidden dim for the blocks.
+        vocab_size (`int`, *optional*, defaults to 8192):
+            Size of the output vocabulary for the codebook.
+        freeze (`bool`, *optional*, defaults to `True`):
+            Whether to freeze the weights of the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaImageCodebookConfig, FlavaImageCodebook
+
+    >>> # Initializing a FlavaImageCodebook with style configuration
+    >>> configuration = FlavaImageCodebookConfig()
+
+    >>> # Initializing a FlavaImageCodebook model (with random weights) from the style configuration
+    >>> model = FlavaImageCodebook(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "flava_image_codebook"
+    base_config_key = "image_codebook_config"
+
+    def __init__(
+        self,
+        num_groups: int = 4,
+        input_channels: int = 3,
+        num_blocks_per_group: int = 2,
+        hidden_size: int = 256,
+        vocab_size: int = 8192,
+        freeze: bool = True,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.num_groups = num_groups
+        self.input_channels = input_channels
+        self.num_blocks_per_group = num_blocks_per_group
+        self.hidden_size = hidden_size
+        self.vocab_size = vocab_size
+        self.freeze = freeze
+        self.initializer_range = initializer_range
+
+
+class FlavaConfig(PretrainedConfig):
+    r"""
+    [`FlavaConfig`] is the configuration class to store the configuration of a [`FlavaModel`]. It is used to
+    instantiate FLAVA model according to the specified arguments, defining the text model, image model, image codebook
+    and multimodal model configs. Instantiating a configuration with the defaults will yield a similar configuration to
+    that of the FLAVA [facebook/flava-full](https://huggingface.co/facebook/flava-full) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`FlavaTextConfig`].
+        image_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`FlavaImageConfig`].
+        multimodal_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`FlavaMultimodalConfig`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and image projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original FLAVA/CLIP
+            implementation.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        ce_ignore_index (`int`, *optional*, defaults to -100):
+            Cross entropy index to ignore.
+        mim_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MIM (Masked Image Modeling) unimodal loss
+        mlm_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MLM (Masked Language Modeling) unimodal loss
+        global_contrastive_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to global contrastive cross-alignment loss.
+        itm_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to image-text matching multimodal loss.
+        mmm_image_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MMM loss's image part.
+        mmm_text_weight (`float`, *optional*, defaults to 1.0):
+            Weight to be assigned to MMM loss's text part.
+        global_backprop_contrastive (`bool`, *optional*, defaults to `True`):
+            Whether to use global backpropgation through all workers in contrastive loss.
+        skip_unmasked_multimodal_encoder (`bool`, *optional*, defaults to `True`):
+            Whether to skip running unmasked multimodal encoder whose outputs are not used by FLAVA losses.
+        return_loss (`bool`, *optional*, defaults to `True`):
+            Whether to return loss or not
+
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import FlavaConfig, FlavaModel, FlavaForPreTraining
+
+    >>> # Initializing a FlavaConfig with style configuration
+    >>> configuration = FlavaConfig()
+
+    >>> # Initializing a FlavaModel and FlavaForPreTraining model (with random weights) from the style configuration
+    >>> model = FlavaModel(configuration)
+    >>> model_pre = FlavaForPreTraining(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    >>> configuration_pre = model_pre.config
+    ```
+    """
+
+    model_type = "flava"
+    sub_configs = {
+        "text_config": FlavaTextConfig,
+        "image_config": FlavaImageConfig,
+        "multimodal_config": FlavaMultimodalConfig,
+        "image_codebook_config": FlavaImageCodebookConfig,
+    }
+
+    def __init__(
+        self,
+        image_config: Optional[dict[str, Any]] = None,
+        text_config: Optional[dict[str, Any]] = None,
+        multimodal_config: Optional[dict[str, Any]] = None,
+        image_codebook_config: Optional[dict[str, Any]] = None,
+        hidden_size: int = 768,
+        layer_norm_eps: float = 1e-12,
+        projection_dim: int = 768,
+        init_codebook: bool = True,
+        logit_scale_init_value: float = 2.6592,
+        initializer_range: float = 0.02,
+        ce_ignore_index: int = -100,
+        mim_weight: float = 1.0,
+        mlm_weight: float = 1.0,
+        global_contrastive_weight: float = 1.0,
+        itm_weight: float = 1.0,
+        mmm_image_weight: float = 1.0,
+        mmm_text_weight: float = 1.0,
+        global_backprop_contrastive: bool = True,
+        skip_unmasked_multimodal_encoder: bool = True,
+        return_loss: bool = True,
+        **kwargs,
+    ):
+        # If `_config_dict` exist, we use them for the backward compatibility.
+        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
+        # of confusion!).
+        text_config_dict = kwargs.pop("text_config_dict", None)
+        image_config_dict = kwargs.pop("image_config_dict", None)
+        multimodal_config_dict = kwargs.pop("multimodal_config_dict", None)
+        image_codebook_config_dict = kwargs.pop("image_codebook_config_dict", None)
+
+        super().__init__(**kwargs)
+
+        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
+        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
+        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
+        if text_config_dict is not None:
+            if text_config is None:
+                text_config = {}
+
+            # This is the complete result when using `text_config_dict`.
+            _text_config_dict = FlavaTextConfig(**text_config_dict).to_dict()
+
+            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
+            for key, value in _text_config_dict.items():
+                if key in text_config and value != text_config[key] and key != "transformers_version":
+                    # If specified in `text_config_dict`
+                    if key in text_config_dict:
+                        message = (
+                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
+                            f'The value `text_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`text_config_dict` is provided which will be used to initialize `FlavaTextConfig`. The "
+                            f'value `text_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `text_config` with the ones in `_text_config_dict`.
+            text_config.update(_text_config_dict)
+
+        if image_config_dict is not None:
+            if image_config is None:
+                image_config = {}
+
+            # This is the complete result when using `image_config_dict`.
+            _image_config_dict = FlavaImageConfig(**image_config_dict).to_dict()
+            # convert keys to string instead of integer
+            if "id2label" in _image_config_dict:
+                _image_config_dict["id2label"] = {
+                    str(key): value for key, value in _image_config_dict["id2label"].items()
+                }
+
+            # Give a warning if the values exist in both `_image_config_dict` and `image_config` but being different.
+            for key, value in _image_config_dict.items():
+                if key in image_config and value != image_config[key] and key != "transformers_version":
+                    # If specified in `image_config_dict`
+                    if key in image_config_dict:
+                        message = (
+                            f"`{key}` is found in both `image_config_dict` and `image_config` but with different "
+                            f'values. The value `image_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`image_config_dict` is provided which will be used to initialize `FlavaImageConfig`. "
+                            f'The value `image_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `image_config` with the ones in `_image_config_dict`.
+            image_config.update(_image_config_dict)
+
+        if multimodal_config_dict is not None:
+            if multimodal_config is None:
+                multimodal_config = {}
+
+            # This is the complete result when using `multimodal_config_dict`.
+            _multimodal_config_dict = FlavaMultimodalConfig(**multimodal_config_dict).to_dict()
+
+            # Give a warning if the values exist in both `_multimodal_config_dict` and `multimodal_config` but being
+            # different.
+            for key, value in _multimodal_config_dict.items():
+                if key in multimodal_config and value != multimodal_config[key] and key != "transformers_version":
+                    # If specified in `multimodal_config_dict`
+                    if key in multimodal_config_dict:
+                        message = (
+                            f"`{key}` is found in both `multimodal_config_dict` and `multimodal_config` but with "
+                            f'different values. The value `multimodal_config_dict["{key}"]` will be used instead.'
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`multimodal_config_dict` is provided which will be used to initialize "
+                            f'`FlavaMultimodalConfig`. The value `multimodal_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `multimodal_config` with the ones in `_multimodal_config_dict`.
+            multimodal_config.update(_multimodal_config_dict)
+
+        if image_codebook_config_dict is not None:
+            if image_codebook_config is None:
+                image_codebook_config = {}
+
+            # This is the complete result when using `image_codebook_config_dict`.
+            _image_codebook_config_dict = FlavaImageCodebookConfig(**image_codebook_config_dict).to_dict()
+
+            # Give a warning if the values exist in both `_image_codebook_config_dict` and `image_codebook_config` but
+            # being different.
+            for key, value in _image_codebook_config_dict.items():
+                if (
+                    key in image_codebook_config
+                    and value != image_codebook_config[key]
+                    and key != "transformers_version"
+                ):
+                    # If specified in `image_codebook_config_dict`
+                    if key in image_codebook_config_dict:
+                        message = (
+                            f"`{key}` is found in both `image_codebook_config_dict` and `image_codebook_config` but "
+                            f'with different values. The value `image_codebook_config_dict["{key}"]` will be used '
+                            "instead."
+                        )
+                    # If inferred from default argument values (just to be super careful)
+                    else:
+                        message = (
+                            f"`image_codebook_config_dict` is provided which will be used to initialize "
+                            f'`FlavaImageCodebookConfig`. The value `image_codebook_config["{key}"]` will be overridden.'
+                        )
+                    logger.info(message)
+
+            # Update all values in `image_codebook_config` with the ones in `_image_codebook_config_dict`.
+            image_codebook_config.update(_image_codebook_config_dict)
+
+        if image_config is None:
+            image_config = {}
+            logger.info("`image_config` is `None`. initializing the `FlavaImageConfig` with default values.")
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `FlavaTextConfig` with default values.")
+
+        if multimodal_config is None:
+            multimodal_config = {}
+            logger.info("`multimodal_config` is `None`. initializing the `FlavaMultimodalConfig` with default values.")
+
+        if image_codebook_config is None:
+            image_codebook_config = {}
+            logger.info(
+                "`image_codebook_config` is `None`. initializing the `FlavaImageCodebookConfig` with default values."
+            )
+
+        self.image_config = FlavaImageConfig(**image_config)
+        self.text_config = FlavaTextConfig(**text_config)
+        self.multimodal_config = FlavaMultimodalConfig(**multimodal_config)
+        self.image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config)
+        self.projection_dim = projection_dim
+        self.init_codebook = init_codebook
+
+        self.hidden_size = hidden_size
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.logit_scale_init_value = logit_scale_init_value
+        self.initializer_factor = 1.0
+        self.ce_ignore_index = ce_ignore_index
+        self.mim_weight = mim_weight
+        self.mlm_weight = mlm_weight
+        self.global_contrastive_weight = global_contrastive_weight
+        self.itm_weight = itm_weight
+        self.mmm_image_weight = mmm_image_weight
+        self.mmm_text_weight = mmm_text_weight
+        self.global_backprop_contrastive = global_backprop_contrastive
+        self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder
+        self.return_loss = return_loss
+
+    @classmethod
+    def from_configs(
+        cls,
+        image_config: FlavaImageConfig,
+        text_config: FlavaTextConfig,
+        multimodal_config: FlavaMultimodalConfig,
+        image_codebook_config: FlavaImageCodebookConfig,
+        **kwargs,
+    ):
+        r"""
+        Build a [`FlavaConfig`] (or a derived class) from four already-constructed sub-configurations: image, text,
+        multimodal and image codebook. Each one is serialized with `to_dict()` and forwarded to the constructor.
+
+        Returns:
+            [`FlavaConfig`]: An instance of a configuration object
+        """
+        # Serialize every sub-config to a plain dict; extra keyword arguments pass through untouched.
+        serialized = {
+            "image_config": image_config.to_dict(),
+            "text_config": text_config.to_dict(),
+            "multimodal_config": multimodal_config.to_dict(),
+            "image_codebook_config": image_codebook_config.to_dict(),
+        }
+        return cls(**serialized, **kwargs)
+
+
+__all__ = ["FlavaConfig", "FlavaImageCodebookConfig", "FlavaImageConfig", "FlavaMultimodalConfig", "FlavaTextConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/feature_extraction_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/feature_extraction_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..19bcccc889f546442af71229c998880fbbb2db31
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/feature_extraction_flava.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for FLAVA."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_flava import FlavaImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class FlavaFeatureExtractor(FlavaImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:  # deprecated alias of `FlavaImageProcessor`
+        deprecation_notice = (
+            "The class FlavaFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+            " use FlavaImageProcessor instead."
+        )
+        warnings.warn(deprecation_notice, FutureWarning)
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["FlavaFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b4db246a8fa4195efd8d9840e351aae43af2a53
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava.py
@@ -0,0 +1,706 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Flava."""
+
+import math
+import random
+from collections.abc import Iterable
+from functools import lru_cache
+from typing import Any, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+from ...utils.import_utils import requires
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+# These values are taken from CLIP
+FLAVA_IMAGE_MEAN = OPENAI_CLIP_MEAN
+FLAVA_IMAGE_STD = OPENAI_CLIP_STD
+FLAVA_CODEBOOK_MEAN = [0.0, 0.0, 0.0]
+FLAVA_CODEBOOK_STD = [1.0, 1.0, 1.0]
+LOGIT_LAPLACE_EPS: float = 0.1
+
+
+# Inspired from https://github.com/microsoft/unilm/blob/master/beit/masking_generator.py
+class FlavaMaskingGenerator:
+    def __init__(
+        self,
+        input_size: Union[int, tuple[int, int]] = 14,
+        total_mask_patches: int = 75,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_patches: int = 16,
+        mask_group_min_aspect_ratio: Optional[float] = 0.3,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+    ):
+        # A bare (non-tuple) size describes a square grid of patches.
+        grid = input_size if isinstance(input_size, tuple) else (input_size, input_size)
+        self.height, self.width = grid
+        self.num_patches = self.height * self.width
+        self.total_mask_patches = total_mask_patches
+        self.mask_group_min_patches = mask_group_min_patches
+        # Without an explicit per-group cap, one group may use the entire masking budget.
+        if mask_group_max_patches is None:
+            mask_group_max_patches = total_mask_patches
+        self.mask_group_max_patches = mask_group_max_patches
+        # A missing (or falsy) upper aspect ratio defaults to the reciprocal of the lower one.
+        if not mask_group_max_aspect_ratio:
+            mask_group_max_aspect_ratio = 1 / mask_group_min_aspect_ratio
+        self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
+
+    def __repr__(self):
+        # The last two fields are the log-space aspect-ratio bounds stored in `log_aspect_ratio`.
+        template = "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)"
+        fields = (
+            self.height,
+            self.width,
+            self.mask_group_min_patches,
+            self.mask_group_max_patches,
+            self.total_mask_patches,
+            self.log_aspect_ratio[0],
+            self.log_aspect_ratio[1],
+        )
+        return template % fields
+
+    def get_shape(self):
+        return (self.height, self.width)
+
+    def _mask(self, mask, max_mask_patches):
+        # Try up to 10 random rectangles; stop at the first one that masks at least one new patch.
+        newly_masked = 0
+        for _ in range(10):
+            area = random.uniform(self.mask_group_min_patches, max_mask_patches)
+            ratio = math.exp(random.uniform(*self.log_aspect_ratio))
+            rect_h = int(round(math.sqrt(area * ratio)))
+            rect_w = int(round(math.sqrt(area / ratio)))
+            if rect_w >= self.width or rect_h >= self.height:
+                continue  # the rectangle must be strictly smaller than the grid in both directions
+            top = random.randint(0, self.height - rect_h)
+            left = random.randint(0, self.width - rect_w)
+            window = mask[top : top + rect_h, left : left + rect_w]  # view: writes below land in `mask`
+            free = rect_h * rect_w - window.sum()
+            # Accept only rectangles that add something new without exceeding the allowance.
+            if 0 < free <= max_mask_patches:
+                unset = window == 0
+                window[unset] = 1
+                newly_masked += int(unset.sum())
+            if newly_masked > 0:
+                break
+        return newly_masked
+
+    def __call__(self):
+        mask = np.zeros(shape=self.get_shape(), dtype=int)
+        masked_so_far = 0
+        while masked_so_far < self.total_mask_patches:
+            # Remaining budget for this group, capped by the per-group maximum.
+            budget = min(self.total_mask_patches - masked_so_far, self.mask_group_max_patches)
+            added = self._mask(mask, budget)
+            if added == 0:
+                break  # ten attempts placed nothing new; return what we have
+            masked_so_far += added
+        return mask
+
+
+@requires(backends=("vision",))
+class FlavaImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Flava image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after resizing. Can be overridden by the `size` parameter in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in
+            `preprocess`.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the images. Can be overridden by the `do_center_crop` parameter in `preprocess`.
+        crop_size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of image after the center crop `(crop_size["height"], crop_size["width"])`. Can be overridden by the
+            `crop_size` parameter in `preprocess`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in `preprocess`.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in
+            `preprocess`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in `preprocess`.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        return_image_mask (`bool`, *optional*, defaults to `False`):
+            Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
+        input_size_patches (`int`, *optional*, defaults to 14):
+            Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
+            by the `input_size_patches` parameter in `preprocess`.
+        total_mask_patches (`int`, *optional*, defaults to 75):
+            Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
+            `preprocess`.
+        mask_group_min_patches (`int`, *optional*, defaults to 16):
+            Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
+            parameter in `preprocess`.
+        mask_group_max_patches (`int`, *optional*):
+            Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
+            parameter in `preprocess`.
+        mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
+            Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
+            in `preprocess`.
+        mask_group_max_aspect_ratio (`float`, *optional*):
+            Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
+            in `preprocess`.
+        codebook_do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input for codebook to `codebook_size`. Can be overridden by the
+            `codebook_do_resize` parameter in `preprocess`.
+        codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
+            `preprocess`.
+        codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.LANCZOS`):
+            Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
+            parameter in `preprocess`.
+        codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input for codebook at the center. If the input size is smaller than
+            `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
+            overridden by the `codebook_do_center_crop` parameter in `preprocess`.
+        codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Desired output size for codebook input when applying center-cropping. Can be overridden by the
+            `codebook_crop_size` parameter in `preprocess`.
+        codebook_do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
+            overridden by the `codebook_do_rescale` parameter in `preprocess`.
+        codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
+            `codebook_rescale_factor` parameter in `preprocess`.
+        codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
+            Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
+            `codebook_do_map_pixels` parameter in `preprocess`.
+        codebook_do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
+            be overridden by the `codebook_do_normalize` parameter in `preprocess`.
+        codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
+            The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
+            by the `codebook_image_mean` parameter in `preprocess`.
+        codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[1.0, 1.0, 1.0]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
+            be overridden by the `codebook_image_std` parameter in `preprocess`.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, Iterable[float]]] = None,
+        image_std: Optional[Union[float, Iterable[float]]] = None,
+        # Mask related params
+        return_image_mask: bool = False,
+        input_size_patches: int = 14,
+        total_mask_patches: int = 75,
+        mask_group_min_patches: int = 16,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_aspect_ratio: float = 0.3,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+        # Codebook related params
+        return_codebook_pixels: bool = False,
+        codebook_do_resize: bool = True,
+        codebook_size: Optional[dict[str, int]] = None,
+        codebook_resample: int = PILImageResampling.LANCZOS,
+        codebook_do_center_crop: bool = True,
+        codebook_crop_size: Optional[dict[str, int]] = None,
+        codebook_do_rescale: bool = True,
+        codebook_rescale_factor: Union[int, float] = 1 / 255,
+        codebook_do_map_pixels: bool = True,
+        codebook_do_normalize: bool = True,
+        codebook_image_mean: Optional[Union[float, Iterable[float]]] = None,
+        codebook_image_std: Optional[Union[float, Iterable[float]]] = None,
+        **kwargs,
+    ) -> None:
+        """Record the image, mask and codebook preprocessing settings (described in the class docstring)."""
+        super().__init__(**kwargs)
+        # Sizes left as `None` fall back to 224x224 (main image) or 112x112 (codebook image).
+        size = get_size_dict(size if size is not None else {"height": 224, "width": 224})
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+        codebook_size = codebook_size if codebook_size is not None else {"height": 112, "width": 112}
+        codebook_size = get_size_dict(codebook_size, param_name="codebook_size")
+        codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else {"height": 112, "width": 112}
+        codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size")
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_normalize = do_normalize
+        # `None` statistics fall back to the module-level FLAVA defaults.
+        self.image_mean = image_mean if image_mean is not None else FLAVA_IMAGE_MEAN
+        self.image_std = image_std if image_std is not None else FLAVA_IMAGE_STD
+
+        self.return_image_mask = return_image_mask
+        self.input_size_patches = input_size_patches
+        self.total_mask_patches = total_mask_patches
+        self.mask_group_min_patches = mask_group_min_patches
+        self.mask_group_max_patches = mask_group_max_patches
+        self.mask_group_min_aspect_ratio = mask_group_min_aspect_ratio
+        self.mask_group_max_aspect_ratio = mask_group_max_aspect_ratio
+
+        self.return_codebook_pixels = return_codebook_pixels
+        self.codebook_do_resize = codebook_do_resize
+        self.codebook_size = codebook_size
+        self.codebook_resample = codebook_resample
+        self.codebook_do_center_crop = codebook_do_center_crop
+        self.codebook_crop_size = codebook_crop_size
+        self.codebook_do_rescale = codebook_do_rescale
+        self.codebook_rescale_factor = codebook_rescale_factor
+        self.codebook_do_map_pixels = codebook_do_map_pixels
+        self.codebook_do_normalize = codebook_do_normalize
+        self.codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else FLAVA_CODEBOOK_MEAN
+        self.codebook_image_std = codebook_image_std if codebook_image_std is not None else FLAVA_CODEBOOK_STD
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Variant of the base-class `from_dict` that first moves `codebook_size` / `codebook_crop_size` keyword
+        arguments into the dictionary, e.g. for `FlavaImageProcessor.from_pretrained(checkpoint, codebook_size=600)`
+        """
+        # Work on a shallow copy so the caller's dictionary is left untouched.
+        merged = image_processor_dict.copy()
+        for codebook_key in ("codebook_size", "codebook_crop_size"):
+            if codebook_key in kwargs:
+                merged[codebook_key] = kwargs.pop(codebook_key)
+        return super().from_dict(merged, **kwargs)
+
+    @lru_cache
+    def masking_generator(
+        self,
+        input_size_patches,
+        total_mask_patches,
+        mask_group_min_patches,
+        mask_group_max_patches,
+        mask_group_min_aspect_ratio,
+        mask_group_max_aspect_ratio,
+    ) -> FlavaMaskingGenerator:
+        """Return the `FlavaMaskingGenerator` configured by the given masking settings."""
+        # `lru_cache` keys on the full argument tuple (`self` included), so equal settings reuse one generator.
+        settings = {"input_size": input_size_patches, "total_mask_patches": total_mask_patches}
+        settings["mask_group_min_patches"] = mask_group_min_patches
+        settings["mask_group_max_patches"] = mask_group_max_patches
+        settings["mask_group_min_aspect_ratio"] = mask_group_min_aspect_ratio
+        settings["mask_group_max_aspect_ratio"] = mask_group_max_aspect_ratio
+        return FlavaMaskingGenerator(**settings)
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)  # run `size` through the shared helper before checking its keys
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])  # (height, width) order
+        return resize(  # the module-level `resize` imported from `image_transforms`, not this method
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def map_pixels(self, image: np.ndarray) -> np.ndarray:
+        return LOGIT_LAPLACE_EPS + image * (1 - 2 * LOGIT_LAPLACE_EPS)  # affine remap: x -> (1 - 2e) * x + e
+
+    def _preprocess_image(
+        self,
+        image: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_map_pixels: Optional[bool] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[ChannelDimension] = None,
+    ) -> np.ndarray:
+        """Preprocesses a single image."""
+        # Argument validation is delegated to the shared helper before any pixel work happens.
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays.
+        image = to_numpy_array(image)
+        # Warn only (no error): the rescale step below still runs when `do_rescale` is set.
+        if do_rescale and is_scaled_image(image):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(image)
+        # Fixed order from here on: resize, center crop, rescale, normalize, pixel mapping, channel layout.
+        if do_resize:
+            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+        if do_center_crop:
+            image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
+
+        if do_rescale:
+            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+        if do_normalize:
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+        # `map_pixels` is an affine remap: (1 - 2 * LOGIT_LAPLACE_EPS) * image + LOGIT_LAPLACE_EPS.
+        if do_map_pixels:
+            image = self.map_pixels(image)
+        # `data_format=None` skips the conversion and leaves the channel layout as it is.
+        if data_format is not None:
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        # Mask related params
+        return_image_mask: Optional[bool] = None,
+        input_size_patches: Optional[int] = None,
+        total_mask_patches: Optional[int] = None,
+        mask_group_min_patches: Optional[int] = None,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_aspect_ratio: Optional[float] = None,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+        # Codebook related params
+        return_codebook_pixels: Optional[bool] = None,
+        codebook_do_resize: Optional[bool] = None,
+        codebook_size: Optional[dict[str, int]] = None,
+        codebook_resample: Optional[int] = None,
+        codebook_do_center_crop: Optional[bool] = None,
+        codebook_crop_size: Optional[dict[str, int]] = None,
+        codebook_do_rescale: Optional[bool] = None,
+        codebook_rescale_factor: Optional[float] = None,
+        codebook_do_map_pixels: Optional[bool] = None,
+        codebook_do_normalize: Optional[bool] = None,
+        codebook_image_mean: Optional[Iterable[float]] = None,
+        codebook_image_std: Optional[Iterable[float]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation.
+            return_image_mask (`bool`, *optional*, defaults to `self.return_image_mask`):
+                Whether to return the image mask.
+            input_size_patches (`int`, *optional*, defaults to `self.input_size_patches`):
+                Size of the patches to extract from the image.
+            total_mask_patches (`int`, *optional*, defaults to `self.total_mask_patches`):
+                Total number of patches to extract from the image.
+            mask_group_min_patches (`int`, *optional*, defaults to `self.mask_group_min_patches`):
+                Minimum number of patches to extract from the image.
+            mask_group_max_patches (`int`, *optional*, defaults to `self.mask_group_max_patches`):
+                Maximum number of patches to extract from the image.
+            mask_group_min_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_min_aspect_ratio`):
+                Minimum aspect ratio of the patches to extract from the image.
+            mask_group_max_aspect_ratio (`float`, *optional*, defaults to `self.mask_group_max_aspect_ratio`):
+                Maximum aspect ratio of the patches to extract from the image.
+            return_codebook_pixels (`bool`, *optional*, defaults to `self.return_codebook_pixels`):
+                Whether to return the codebook pixels.
+            codebook_do_resize (`bool`, *optional*, defaults to `self.codebook_do_resize`):
+                Whether to resize the codebook pixels.
+            codebook_size (`dict[str, int]`, *optional*, defaults to `self.codebook_size`):
+                Size of the codebook pixels.
+            codebook_resample (`int`, *optional*, defaults to `self.codebook_resample`):
+                Resampling filter to use if resizing the codebook pixels. This can be one of the enum
+                `PILImageResampling`, Only has an effect if `codebook_do_resize` is set to `True`.
+            codebook_do_center_crop (`bool`, *optional*, defaults to `self.codebook_do_center_crop`):
+                Whether to center crop the codebook pixels.
+            codebook_crop_size (`dict[str, int]`, *optional*, defaults to `self.codebook_crop_size`):
+                Size of the center crop of the codebook pixels. Only has an effect if `codebook_do_center_crop` is set
+                to `True`.
+            codebook_do_rescale (`bool`, *optional*, defaults to `self.codebook_do_rescale`):
+                Whether to rescale the codebook pixels values between [0 - 1].
+            codebook_rescale_factor (`float`, *optional*, defaults to `self.codebook_rescale_factor`):
+                Rescale factor to rescale the codebook pixels by if `codebook_do_rescale` is set to `True`.
+            codebook_do_map_pixels (`bool`, *optional*, defaults to `self.codebook_do_map_pixels`):
+                Whether to map the codebook pixels values.
+            codebook_do_normalize (`bool`, *optional*, defaults to `self.codebook_do_normalize`):
+                Whether to normalize the codebook pixels.
+            codebook_image_mean (`float` or `list[float]`, *optional*, defaults to `self.codebook_image_mean`):
+                Codebook pixels mean to normalize the codebook pixels by if `codebook_do_normalize` is set to `True`.
+            codebook_image_std (`float` or `list[float]`, *optional*, defaults to `self.codebook_image_std`):
+                Codebook pixels standard deviation to normalize the codebook pixels by if `codebook_do_normalize` is
+                set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        return_image_mask = return_image_mask if return_image_mask is not None else self.return_image_mask
+        input_size_patches = input_size_patches if input_size_patches is not None else self.input_size_patches
+        total_mask_patches = total_mask_patches if total_mask_patches is not None else self.total_mask_patches
+        mask_group_min_patches = (
+            mask_group_min_patches if mask_group_min_patches is not None else self.mask_group_min_patches
+        )
+        mask_group_max_patches = (
+            mask_group_max_patches if mask_group_max_patches is not None else self.mask_group_max_patches
+        )
+        mask_group_min_aspect_ratio = (
+            mask_group_min_aspect_ratio
+            if mask_group_min_aspect_ratio is not None
+            else self.mask_group_min_aspect_ratio
+        )
+        mask_group_max_aspect_ratio = (
+            mask_group_max_aspect_ratio
+            if mask_group_max_aspect_ratio is not None
+            else self.mask_group_max_aspect_ratio
+        )
+
+        return_codebook_pixels = (
+            return_codebook_pixels if return_codebook_pixels is not None else self.return_codebook_pixels
+        )
+        codebook_do_resize = codebook_do_resize if codebook_do_resize is not None else self.codebook_do_resize
+        codebook_size = codebook_size if codebook_size is not None else self.codebook_size
+        codebook_size = get_size_dict(codebook_size, param_name="codebook_size")
+        codebook_resample = codebook_resample if codebook_resample is not None else self.codebook_resample
+        codebook_do_rescale = codebook_do_rescale if codebook_do_rescale is not None else self.codebook_do_rescale
+        codebook_rescale_factor = (
+            codebook_rescale_factor if codebook_rescale_factor is not None else self.codebook_rescale_factor
+        )
+        codebook_do_center_crop = (
+            codebook_do_center_crop if codebook_do_center_crop is not None else self.codebook_do_center_crop
+        )
+        codebook_crop_size = codebook_crop_size if codebook_crop_size is not None else self.codebook_crop_size
+        codebook_crop_size = get_size_dict(codebook_crop_size, param_name="codebook_crop_size")
+        codebook_do_map_pixels = (
+            codebook_do_map_pixels if codebook_do_map_pixels is not None else self.codebook_do_map_pixels
+        )
+        codebook_do_normalize = (
+            codebook_do_normalize if codebook_do_normalize is not None else self.codebook_do_normalize
+        )
+        codebook_image_mean = codebook_image_mean if codebook_image_mean is not None else self.codebook_image_mean
+        codebook_image_std = codebook_image_std if codebook_image_std is not None else self.codebook_image_std
+
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        processed_images = [
+            self._preprocess_image(
+                image=img,
+                do_resize=do_resize,
+                size=size,
+                resample=resample,
+                do_center_crop=do_center_crop,
+                crop_size=crop_size,
+                do_rescale=do_rescale,
+                rescale_factor=rescale_factor,
+                do_normalize=do_normalize,
+                image_mean=image_mean,
+                image_std=image_std,
+                do_map_pixels=False,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+            for img in images
+        ]
+        data = {"pixel_values": processed_images}
+
+        if return_codebook_pixels:
+            codebook_images = [
+                self._preprocess_image(
+                    image=img,
+                    do_resize=codebook_do_resize,
+                    size=codebook_size,
+                    resample=codebook_resample,
+                    do_center_crop=codebook_do_center_crop,
+                    crop_size=codebook_crop_size,
+                    do_rescale=codebook_do_rescale,
+                    rescale_factor=codebook_rescale_factor,
+                    do_normalize=codebook_do_normalize,
+                    image_mean=codebook_image_mean,
+                    image_std=codebook_image_std,
+                    do_map_pixels=codebook_do_map_pixels,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                for img in images
+            ]
+            data["codebook_pixel_values"] = codebook_images
+
+        if return_image_mask:
+            mask_generator = self.masking_generator(
+                input_size_patches=input_size_patches,
+                total_mask_patches=total_mask_patches,
+                mask_group_min_patches=mask_group_min_patches,
+                mask_group_max_patches=mask_group_max_patches,
+                mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
+                mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
+            )
+            masks = [mask_generator() for _ in images]
+            data["bool_masked_pos"] = masks
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["FlavaImageProcessor"]  # the only name exported by `from <this module> import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..732d25e71f697e083e784199c1db1782298951ac
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/image_processing_flava_fast.py
@@ -0,0 +1,491 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for Flava."""
+
+import math
+import random
+from collections.abc import Iterable
+from functools import lru_cache
+from typing import Any, Optional, Union
+
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    BatchFeature,
+    DefaultFastImageProcessorKwargs,
+    get_size_dict,
+)
+from ...image_transforms import ChannelDimension, group_images_by_shape, reorder_images
+from ...image_utils import ImageInput, PILImageResampling, SizeDict, pil_torch_interpolation_mapping
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+)
+from .image_processing_flava import (
+    FLAVA_CODEBOOK_MEAN,
+    FLAVA_CODEBOOK_STD,
+    FLAVA_IMAGE_MEAN,
+    FLAVA_IMAGE_STD,
+    LOGIT_LAPLACE_EPS,
+)
+
+
+class FlavaMaskingGenerator:
+    """Random block-wise patch masking: calling an instance returns a (height, width) int tensor, 1 = masked."""
+
+    def __init__(
+        self,
+        input_size: Union[int, tuple[int, int]] = 14,
+        total_mask_patches: int = 75,
+        mask_group_max_patches: Optional[int] = None,
+        mask_group_min_patches: int = 16,
+        mask_group_min_aspect_ratio: Optional[float] = 0.3,
+        mask_group_max_aspect_ratio: Optional[float] = None,
+    ):
+        # An int `input_size` means a square grid; a tuple is read as (height, width).
+        if not isinstance(input_size, tuple):
+            input_size = (input_size,) * 2
+        self.height, self.width = input_size
+        self.num_patches = self.height * self.width
+        self.total_mask_patches = total_mask_patches
+        self.mask_group_min_patches = mask_group_min_patches
+        self.mask_group_max_patches = total_mask_patches if mask_group_max_patches is None else mask_group_max_patches
+        # The signature allows `None` for the minimum ratio; use the declared default instead of failing in `1 / None`.
+        if mask_group_min_aspect_ratio is None:
+            mask_group_min_aspect_ratio = 0.3
+        mask_group_max_aspect_ratio = mask_group_max_aspect_ratio or 1 / mask_group_min_aspect_ratio
+        self.log_aspect_ratio = (math.log(mask_group_min_aspect_ratio), math.log(mask_group_max_aspect_ratio))
+
+    def __repr__(self):
+        # The two trailing `%.3f` fields are the log aspect-ratio bounds, not the raw ratios.
+        return "MaskingGenerator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % (
+            self.height,
+            self.width,
+            self.mask_group_min_patches,
+            self.mask_group_max_patches,
+            self.total_mask_patches,
+            self.log_aspect_ratio[0],
+            self.log_aspect_ratio[1],
+        )
+
+    def get_shape(self):
+        """Return the patch grid size as `(height, width)`."""
+        return self.height, self.width
+
+    def _mask(self, mask, max_mask_patches):
+        """Mask one random rectangle of `mask` in place (at most 10 attempts); return how many patches were newly set."""
+        delta = 0
+        for _attempt in range(10):
+            target_area = random.uniform(self.mask_group_min_patches, max_mask_patches)
+            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
+            height = int(round(math.sqrt(target_area * aspect_ratio)))
+            width = int(round(math.sqrt(target_area / aspect_ratio)))
+            if width < self.width and height < self.height:
+                top = random.randint(0, self.height - height)
+                left = random.randint(0, self.width - width)
+                num_masked = mask[top : top + height, left : left + width].sum()
+                # Accept the rectangle only if it adds between 1 and `max_mask_patches` new patches.
+                if 0 < height * width - num_masked <= max_mask_patches:
+                    zeros_pos = mask[top : top + height, left : left + width] == 0
+                    mask[top : top + height, left : left + width][zeros_pos] = 1
+                    delta += zeros_pos.sum()
+                if delta > 0:
+                    break
+        return delta
+
+    def __call__(self):
+        """Generate a fresh mask, adding rectangles until `total_mask_patches` is reached or `_mask` adds nothing."""
+        mask = torch.zeros(self.get_shape(), dtype=torch.int)
+        mask_count = 0
+        while mask_count < self.total_mask_patches:
+            max_mask_patches = min(self.total_mask_patches - mask_count, self.mask_group_max_patches)
+            delta = self._mask(mask, max_mask_patches)
+            if delta == 0:
+                break
+            mask_count += delta
+        return mask
+
+
+class FlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    Args:
+        return_image_mask (`bool`, *optional*, defaults to `False`):
+            Whether to return the image mask. Can be overridden by the `return_image_mask` parameter in `preprocess`.
+        input_size_patches (`int`, *optional*, defaults to 14):
+            Number of patches in the image in height and width direction. 14x14 = 196 total patches. Can be overridden
+            by the `input_size_patches` parameter in `preprocess`.
+        total_mask_patches (`int`, *optional*, defaults to 75):
+            Total number of patches that should be masked. Can be overridden by the `total_mask_patches` parameter in
+            `preprocess`.
+        mask_group_min_patches (`int`, *optional*, defaults to 16):
+            Minimum number of patches that should be masked. Can be overridden by the `mask_group_min_patches`
+            parameter in `preprocess`.
+        mask_group_max_patches (`int`, *optional*):
+            Maximum number of patches that should be masked. Can be overridden by the `mask_group_max_patches`
+            parameter in `preprocess`.
+        mask_group_min_aspect_ratio (`float`, *optional*, defaults to 0.3):
+            Minimum aspect ratio of the mask window. Can be overridden by the `mask_group_min_aspect_ratio` parameter
+            in `preprocess`.
+        mask_group_max_aspect_ratio (`float`, *optional*):
+            Maximum aspect ratio of the mask window. Can be overridden by the `mask_group_max_aspect_ratio` parameter
+            in `preprocess`.
+        return_codebook_pixels (`bool`, *optional*, defaults to `False`):
+            Whether to return the codebook pixel values.
+        codebook_do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input for codebook to a certain `codebook_size`. Can be overridden by the
+            `codebook_do_resize` parameter in `preprocess`.
+        codebook_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Resize the input for codebook to the given size. Can be overridden by the `codebook_size` parameter in
+            `preprocess`.
+        codebook_resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the codebook image. Can be overridden by the `codebook_resample`
+            parameter in `preprocess`.
+        codebook_do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input for codebook at the center. If the input size is smaller than
+            `codebook_crop_size` along any edge, the image is padded with 0's and then center cropped. Can be
+            overridden by the `codebook_do_center_crop` parameter in `preprocess`.
+        codebook_crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 112, "width": 112}`):
+            Desired output size for codebook input when applying center-cropping. Can be overridden by the
+            `codebook_crop_size` parameter in `preprocess`.
+        codebook_do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the input for codebook by the specified scale `codebook_rescale_factor`. Can be
+            overridden by the `codebook_do_rescale` parameter in `preprocess`.
+        codebook_rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Defines the scale factor to use if rescaling the codebook image. Can be overridden by the
+            `codebook_rescale_factor` parameter in `preprocess`.
+        codebook_do_map_pixels (`bool`, *optional*, defaults to `True`):
+            Whether to map the pixel values of the codebook input to (1 - 2e)x + e. Can be overridden by the
+            `codebook_do_map_pixels` parameter in `preprocess`.
+        codebook_do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input for codebook with `codebook_image_mean` and `codebook_image_std`. Can
+            be overridden by the `codebook_do_normalize` parameter in `preprocess`.
+        codebook_image_mean (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0, 0, 0]`):
+            The sequence of means for each channel, to be used when normalizing images for codebook. Can be overridden
+            by the `codebook_image_mean` parameter in `preprocess`.
+        codebook_image_std (`Optional[Union[float, Iterable[float]]]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images for codebook. Can
+            be overridden by the `codebook_image_std` parameter in `preprocess`.
+    """
+
+    # Mask related params
+    return_image_mask: Optional[bool]
+    input_size_patches: Optional[int]
+    total_mask_patches: Optional[int]
+    mask_group_min_patches: Optional[int]
+    mask_group_max_patches: Optional[int]
+    mask_group_min_aspect_ratio: Optional[float]
+    mask_group_max_aspect_ratio: Optional[float]
+    # Codebook related params
+    return_codebook_pixels: Optional[bool]
+    codebook_do_resize: Optional[bool]
+    codebook_size: Optional[dict[str, int]]  # a size dict such as {"height": ..., "width": ...}, not a flag
+    codebook_resample: Optional[int]
+    codebook_do_center_crop: Optional[bool]
+    codebook_crop_size: Optional[dict[str, int]]  # a size dict, same layout as `codebook_size`
+    codebook_do_rescale: Optional[bool]
+    codebook_rescale_factor: Optional[Union[int, float]]
+    codebook_do_map_pixels: Optional[bool]
+    codebook_do_normalize: Optional[bool]
+    codebook_image_mean: Optional[Union[float, Iterable[float]]]
+    codebook_image_std: Optional[Union[float, Iterable[float]]]
+
+
+@auto_docstring
+class FlavaImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = FLAVA_IMAGE_MEAN
+    image_std = FLAVA_IMAGE_STD
+    size = {"height": 224, "width": 224}
+    crop_size = {"height": 224, "width": 224}
+    do_resize = True
+    do_center_crop = True
+    do_rescale = True
+    do_normalize = True
+
+    # Mask related params
+    return_image_mask = False
+    input_size_patches = 14  # patches per side, i.e. a 14 x 14 mask grid
+    total_mask_patches = 75
+    mask_group_min_patches = 16
+    mask_group_max_patches = None  # FlavaMaskingGenerator substitutes total_mask_patches for None
+    mask_group_min_aspect_ratio = 0.3
+    mask_group_max_aspect_ratio = None  # FlavaMaskingGenerator substitutes 1 / mask_group_min_aspect_ratio
+    # Codebook related params
+    return_codebook_pixels = False
+    codebook_do_resize = True
+    codebook_size = {"height": 112, "width": 112}
+    # LANCZOS resample does not support torch Tensor. Use BICUBIC as closest alternative
+    codebook_resample = PILImageResampling.BICUBIC
+    codebook_do_center_crop = True
+    codebook_crop_size = {"height": 112, "width": 112}
+    codebook_do_rescale = True
+    codebook_rescale_factor = 1 / 255
+    codebook_do_map_pixels = True  # codebook images additionally go through `map_pixels`
+    codebook_do_normalize = True
+    codebook_image_mean = FLAVA_CODEBOOK_MEAN
+    codebook_image_std = FLAVA_CODEBOOK_STD
+    valid_kwargs = FlavaFastImageProcessorKwargs  # kwargs class declaring the mask/codebook options
+
+    def __init__(self, **kwargs: Unpack[FlavaFastImageProcessorKwargs]):
+        super().__init__(**kwargs)  # no FLAVA-specific setup here; every kwarg goes to the base initializer
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[FlavaFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)  # thin override: exists to expose the FLAVA-specific kwargs typing
+
+    @classmethod
+    def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
+        """
+        Build the processor from a dict while letting `codebook_size` / `codebook_crop_size` passed as kwargs replace
+        the stored values, e.g. `FlavaImageProcessor.from_pretrained(checkpoint, codebook_size=600)`.
+        """
+        merged_dict = image_processor_dict.copy()
+        # Move each codebook override that was supplied out of `kwargs` and into the copied dict.
+        for override_key in ("codebook_size", "codebook_crop_size"):
+            if override_key in kwargs:
+                merged_dict[override_key] = kwargs.pop(override_key)
+        return super().from_dict(merged_dict, **kwargs)
+
+    @lru_cache
+    def masking_generator(
+        self, input_size_patches, total_mask_patches, mask_group_min_patches, mask_group_max_patches,
+        mask_group_min_aspect_ratio, mask_group_max_aspect_ratio,
+    ) -> FlavaMaskingGenerator:
+        """
+        Create the mask generator for the given settings. Because of `lru_cache`, a repeated call with the same
+        (hashable) arguments returns the previously built generator instead of a new one.
+        """
+        generator_kwargs = {
+            "input_size": input_size_patches,
+            "total_mask_patches": total_mask_patches,
+            "mask_group_min_patches": mask_group_min_patches,
+            "mask_group_max_patches": mask_group_max_patches,
+            "mask_group_min_aspect_ratio": mask_group_min_aspect_ratio,
+            "mask_group_max_aspect_ratio": mask_group_max_aspect_ratio,
+        }
+        return FlavaMaskingGenerator(**generator_kwargs)
+
+    def map_pixels(self, image: "torch.Tensor") -> "torch.Tensor":
+        return LOGIT_LAPLACE_EPS + image * (1 - 2 * LOGIT_LAPLACE_EPS)  # affine map x -> (1 - 2e)x + e
+
+    def _further_process_kwargs(
+        self,
+        size: Optional[SizeDict] = None,
+        crop_size: Optional[SizeDict] = None,
+        default_to_square: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        codebook_size: Optional[SizeDict] = None,
+        codebook_crop_size: Optional[SizeDict] = None,
+        codebook_image_mean: Optional[Union[float, list[float]]] = None,
+        codebook_image_std: Optional[Union[float, list[float]]] = None,
+        codebook_resample: Optional[PILImageResampling] = None,
+        data_format: Optional[ChannelDimension] = None,
+        **kwargs,
+    ) -> dict:
+        """
+        Prepare keyword arguments before validation (a hook that subclasses may override).
+
+        Sizes and crop sizes (plain and `codebook_` variants) are wrapped in `SizeDict`, list-valued means/stds
+        become tuples, `data_format` falls back to `ChannelDimension.FIRST`, and the PIL filters `resample` /
+        `codebook_resample` are replaced by `interpolation` / `codebook_interpolation`.
+
+        Args:
+            default_to_square (`bool`, *optional*):
+                Forwarded to `get_size_dict` for `size` and `codebook_size`.
+            kwargs:
+                Remaining options; must contain a `resample` entry.
+
+        Returns:
+            `dict`: the leftover `kwargs` updated with the processed entries described above.
+        """
+
+        def _wrap_size(raw_size, **size_options):
+            # Leave `None` alone; otherwise normalise through `get_size_dict` and wrap the result.
+            return None if raw_size is None else SizeDict(**get_size_dict(raw_size, **size_options))
+
+        def _list_to_tuple(stats):
+            # Only `list` inputs are converted; every other value passes through unchanged.
+            return tuple(stats) if isinstance(stats, list) else stats
+
+        def _torch_interpolation(pil_filter):
+            # torch resize uses interpolation instead of resample: PIL enum members and plain ints are looked up
+            # in the PIL -> torch mapping, any other value (e.g. `None`) is forwarded as-is.
+            # NOTE(review): with pillow < 9.1.0 `PILImageResampling` is said to be a module - confirm isinstance copes.
+            if isinstance(pil_filter, (PILImageResampling, int)):
+                return pil_torch_interpolation_mapping[pil_filter]
+            return pil_filter
+
+        # Keyword arguments are evaluated top to bottom, so the conversions run in the order listed here.
+        kwargs.update(
+            size=_wrap_size(size, default_to_square=default_to_square),
+            crop_size=_wrap_size(crop_size, param_name="crop_size"),
+            image_mean=_list_to_tuple(image_mean),
+            image_std=_list_to_tuple(image_std),
+            codebook_size=_wrap_size(codebook_size, default_to_square=default_to_square),
+            codebook_crop_size=_wrap_size(codebook_crop_size, param_name="codebook_crop_size"),
+            codebook_image_mean=_list_to_tuple(codebook_image_mean),
+            codebook_image_std=_list_to_tuple(codebook_image_std),
+            data_format=ChannelDimension.FIRST if data_format is None else data_format,
+            codebook_interpolation=_torch_interpolation(codebook_resample),
+        )
+
+        # `resample` is consumed here and re-emitted under the `interpolation` key; a missing entry raises
+        # `KeyError` from `pop`.
+        kwargs["interpolation"] = _torch_interpolation(kwargs.pop("resample"))
+        return kwargs
+
+    def _preprocess_image(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        do_map_pixels: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+    ) -> "torch.Tensor":
+        """
+        Run resize -> center crop -> rescale/normalize -> optional pixel mapping on `images`, batching images of
+        identical shape, and return them in their original order (stacked into one tensor if `return_tensors` is set).
+        """
+        # Pass 1: optional resize, one batched call per group of same-shaped inputs.
+        by_shape, original_positions = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_by_shape = {
+            shape: self.resize(image=batch, size=size, interpolation=interpolation) if do_resize else batch
+            for shape, batch in by_shape.items()
+        }
+        resized_images = reorder_images(resized_by_shape, original_positions)
+
+        # Pass 2: regroup (shapes can still differ when resizing is off or yields mixed sizes), then crop,
+        # rescale/normalize and, when requested, apply `map_pixels`.
+        by_shape, original_positions = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        finished_by_shape = {}
+        for shape, batch in by_shape.items():
+            cropped = self.center_crop(batch, crop_size) if do_center_crop else batch
+            # Fused rescale and normalize
+            scaled = self.rescale_and_normalize(
+                cropped, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            finished_by_shape[shape] = self.map_pixels(image=scaled) if do_map_pixels else scaled
+
+        processed_images = reorder_images(finished_by_shape, original_positions)
+        if return_tensors:
+            return torch.stack(processed_images, dim=0)
+        return processed_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        # Mask related params
+        return_image_mask: Optional[bool],
+        input_size_patches: Optional[int],
+        total_mask_patches: Optional[int],
+        mask_group_min_patches: Optional[int],
+        mask_group_max_patches: Optional[int],
+        mask_group_min_aspect_ratio: Optional[float],
+        mask_group_max_aspect_ratio: Optional[float],
+        # Codebook related params
+        return_codebook_pixels: Optional[bool],
+        codebook_do_resize: Optional[bool],
+        codebook_size: Optional[SizeDict],
+        codebook_interpolation: Optional["F.InterpolationMode"],
+        codebook_do_center_crop: Optional[bool],
+        codebook_crop_size: Optional[SizeDict],
+        codebook_do_rescale: Optional[bool],
+        codebook_rescale_factor: Optional[float],
+        codebook_do_map_pixels: Optional[bool],
+        codebook_do_normalize: Optional[bool],
+        codebook_image_mean: Optional[Union[float, list[float]]],
+        codebook_image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        processed_images = self._preprocess_image(
+            images=images,
+            do_resize=do_resize,
+            size=size,
+            interpolation=interpolation,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            do_map_pixels=False,
+            image_mean=image_mean,
+            image_std=image_std,
+            disable_grouping=disable_grouping,
+            return_tensors=return_tensors,
+        )
+        data = {
+            "pixel_values": processed_images,
+        }
+
+        if return_codebook_pixels:
+            codebook_processed_images = self._preprocess_image(
+                images=images,
+                do_resize=codebook_do_resize,
+                size=codebook_size,
+                interpolation=codebook_interpolation,
+                do_center_crop=codebook_do_center_crop,
+                crop_size=codebook_crop_size,
+                do_rescale=codebook_do_rescale,
+                rescale_factor=codebook_rescale_factor,
+                do_normalize=codebook_do_normalize,
+                do_map_pixels=codebook_do_map_pixels,
+                image_mean=codebook_image_mean,
+                image_std=codebook_image_std,
+                disable_grouping=disable_grouping,
+                return_tensors=return_tensors,
+            )
+            data["codebook_pixel_values"] = codebook_processed_images
+
+        if return_image_mask:
+            mask_generator = self.masking_generator(
+                input_size_patches=input_size_patches,
+                total_mask_patches=total_mask_patches,
+                mask_group_min_patches=mask_group_min_patches,
+                mask_group_max_patches=mask_group_max_patches,
+                mask_group_min_aspect_ratio=mask_group_min_aspect_ratio,
+                mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
+            )
+            masks = [mask_generator() for _ in range(len(images))]
+            masks = torch.stack(masks, dim=0) if return_tensors else masks
+            data["bool_masked_pos"] = masks
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["FlavaImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/modeling_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/modeling_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..266c3e96af5a263db2b49a31223f9652563ae213
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/modeling_flava.py
@@ -0,0 +1,2030 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch FLAVA model."""
+
+import collections
+import math
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import ModelOutput, auto_docstring, filter_out_non_signature_kwargs, logging, torch_int
+from .configuration_flava import (
+    FlavaConfig,
+    FlavaImageCodebookConfig,
+    FlavaImageConfig,
+    FlavaMultimodalConfig,
+    FlavaTextConfig,
+)
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_CODEBOOK_DOC = "facebook/flava-image-codebook"
+
+LOGIT_SCALE_CLAMP_MIN = 0
+LOGIT_SCALE_CLAMP_MAX = 4.6052  # approximately ln(100); NOTE(review): the clamp site is outside this chunk
+
+FlavaPossibleConfigs = Union[FlavaTextConfig, FlavaImageConfig, FlavaMultimodalConfig]  # annotation alias for configs
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output from FlavaModel containing embeddings and outputs from individual encoders.
+
+    Note that `image_embeddings` and `text_embeddigns` returned are similar to pooled output returned from a
+    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
+    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
+    """
+)
+class FlavaModelOutput(ModelOutput):
+    r"""
+    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
+        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
+    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
+        The output of the [`FlavaImageModel`].
+    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
+        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
+    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
+        The output of the [`FlavaTextModel`].
+    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
+        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
+    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
+        The output of the [`FlavaMultimodalModel`].
+    """
+
+    image_embeddings: Optional[torch.FloatTensor] = None
+    image_output: Optional[BaseModelOutputWithPooling] = None
+    text_embeddings: Optional[torch.FloatTensor] = None
+    text_output: Optional[BaseModelOutputWithPooling] = None
+    multimodal_embeddings: Optional[torch.FloatTensor] = None
+    multimodal_output: Optional[BaseModelOutputWithPooling] = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_output", "image_output", "multimodal_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class representing pretraining losses from FLAVA model
+    """
+)
+class FlavaLosses(ModelOutput):
+    r"""
+    mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.):
+        Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
+    mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.):
+        Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
+    itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.):
+        Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
+        masked pairs in FLAVA.
+    global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.):
+        Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
+        data. This is calculated on unmasked images and texts.
+    mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.):
+        Masked Multimodal Modeling loss's image component calculated on paired image-text data.
+    mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.):
+        Masked Multimodal Modeling loss's text component calculated on paired image-text data.
+    """
+
+    mim: Optional[torch.FloatTensor] = None
+    mlm: Optional[torch.FloatTensor] = None
+    itm: Optional[torch.FloatTensor] = None
+    global_contrastive: Optional[torch.FloatTensor] = None
+    mmm_image: Optional[torch.FloatTensor] = None
+    mmm_text: Optional[torch.FloatTensor] = None
+
+    def all_none(self) -> bool:
+        all_none = True
+        for v in self.values():
+            if v is not None:
+                all_none = False
+                break
+        return all_none
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.
+
+    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
+    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
+    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
+    """
+)
+class FlavaForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
+        Total loss calculated for this model.
+    loss_info (`FlavaLosses`):
+        Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
+        the keys.
+    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
+        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
+    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
+        The output of the [`FlavaImageModel`].
+    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
+        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
+    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
+        The output of the [`FlavaTextModel`].
+    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
+        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
+    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
+        The output of the [`FlavaMultimodalModel`].
+    image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
+        The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
+        to create masked images.
+    image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
+        The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
+    text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
+        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
+    text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
+        The output of the [`FlavaTextModel`].
+    multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
+        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
+    multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
+        The output of the [`FlavaMultimodalModel`].
+    mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
+        The logits for MIM unimodal loss. Uses `book_masked_pos` to get masked patches. The flattened output is
+            returned when `bool_masked_pos` has some of the patches masked.
+    mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
+        The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
+            the tokens masked.
+    itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
+        The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
+    contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
+        `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
+        scores. This is calculated on unmasked images and texts.
+    contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
+        `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
+        texts.
+    mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
+        The logits for MMM image multimodal loss. Uses `book_masked_pos` to get masked patches. The flattened
+            output is returned when `bool_masked_pos` has some of the patches masked.
+    mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(`(total_masked_seq_length, text_vocab_size)`), *optional*, returned when `pixel_values` and `input_ids_masked` are present):
+        The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
+            some of the tokens masked.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_info: FlavaLosses = None
+    image_embeddings: Optional[torch.FloatTensor] = None
+    image_output: Optional[BaseModelOutputWithPooling] = None
+    text_embeddings: Optional[torch.FloatTensor] = None
+    text_output: Optional[BaseModelOutputWithPooling] = None
+    multimodal_embeddings: Optional[torch.FloatTensor] = None
+    multimodal_output: Optional[BaseModelOutputWithPooling] = None
+    image_masked_embeddings: Optional[torch.FloatTensor] = None
+    image_masked_output: Optional[BaseModelOutputWithPooling] = None
+    text_masked_embeddings: Optional[torch.FloatTensor] = None
+    text_masked_output: Optional[BaseModelOutputWithPooling] = None
+    multimodal_masked_embeddings: Optional[torch.FloatTensor] = None
+    multimodal_masked_output: Optional[BaseModelOutputWithPooling] = None
+    mim_logits: Optional[torch.FloatTensor] = None
+    mlm_logits: Optional[torch.FloatTensor] = None
+    itm_logits: Optional[torch.FloatTensor] = None
+    contrastive_logits_per_image: Optional[torch.FloatTensor] = None
+    contrastive_logits_per_text: Optional[torch.FloatTensor] = None
+    mmm_image_logits: Optional[torch.FloatTensor] = None
+    mmm_text_logits: Optional[torch.FloatTensor] = None
+
+    def to_tuple(self) -> tuple[Any]:
+        transformer_outputs = [
+            "text_output",
+            "image_output",
+            "multimodal_output",
+            "text_masked_output",
+            "image_masked_output",
+            "multimodal_masked_output",
+        ]
+        return tuple(self[k] if k not in transformer_outputs else getattr(self, k).to_tuple() for k in self.keys())
+
+
+# Based on timm implementation, which can be found here:
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+class FlavaImageEmbeddings(nn.Module):
+    """
+    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+    """
+
+    def __init__(self, config: FlavaImageConfig, use_mask_token: bool = False) -> None:
+        super().__init__()
+
+        use_mask_token = use_mask_token or config.mask_token
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
+        self.patch_embeddings = PatchEmbeddings(
+            image_size=config.image_size,
+            patch_size=config.patch_size,
+            num_channels=config.num_channels,
+            embed_dim=config.hidden_size,
+        )
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        batch_size, seq_len, _ = embeddings.size()
+        if bool_masked_pos is not None:
+            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
+            # B X H X W = B X HW
+            if bool_masked_pos.dim() == 3:
+                bool_masked_pos = bool_masked_pos.view(bool_masked_pos.size(0), -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+# Based on timm implementation, which can be found here:
+# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+class PatchEmbeddings(nn.Module):
+    """
+    Image to Patch Embedding.
+    """
+
+    def __init__(
+        self,
+        image_size: int = 224,
+        patch_size: Union[int, tuple[int, int]] = 16,
+        num_channels: int = 3,
+        embed_dim: int = 768,
+    ):
+        super().__init__()
+        if not isinstance(image_size, collections.abc.Iterable):
+            image_size = (image_size, image_size)
+        if not isinstance(patch_size, collections.abc.Iterable):
+            patch_size = (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if not interpolate_pos_encoding:
+            if height != self.image_size[0] or width != self.image_size[1]:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size[0]}*{self.image_size[1]})."
+                )
+        x = self.projection(pixel_values).flatten(2).transpose(1, 2)
+        return x
+
+
+class FlavaTextEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        input_shape = input_ids.size()
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class FlavaSelfAttention(nn.Module):
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        key_layer = (
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class FlavaSelfOutput(nn.Module):
+    """
+    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
+    models), due to the layernorm applied before each block.
+    """
+
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class FlavaAttention(nn.Module):
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.attention = FlavaSelfAttention(config)
+        self.output = FlavaSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads: set[int]) -> None:
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        self_outputs = self.attention(
+            hidden_states, attention_mask=attention_mask, head_mask=head_mask, output_attentions=output_attentions
+        )
+
+        attention_output = self.output(self_outputs[0], hidden_states)
+
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class FlavaIntermediate(nn.Module):
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    # Copied from transformers.models.vit.modeling_vit.ViTIntermediate.forward
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+
+class FlavaOutput(nn.Module):
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    # Copied from transformers.models.vit.modeling_vit.ViTOutput.forward
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        hidden_states = hidden_states + input_tensor
+
+        return hidden_states
+
+
+class FlavaLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: FlavaPossibleConfigs) -> None:
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = FlavaAttention(config)
+        self.intermediate = FlavaIntermediate(config)
+        self.output = FlavaOutput(config)
+
+        # TODO: Check fp32 layer norm possibility
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        self_attention_outputs = self.attention(
+            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        # first residual connection
+        hidden_states = attention_output + hidden_states
+
+        # in ViT, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_states)
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+class FlavaEncoder(nn.Module):
+    def __init__(self, config: FlavaConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([FlavaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
+        )
+
+
+class FlavaPooler(nn.Module):
+    def __init__(self, config: FlavaPossibleConfigs):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+@auto_docstring
+class FlavaPreTrainedModel(PreTrainedModel):
+    config: FlavaConfig
+    base_model_prefix = "flava"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, FlavaMaskedPredictionHead):
+            module.bias.data.zero_()
+        elif isinstance(module, FlavaImageEmbeddings):
+            module.cls_token.data.zero_()
+            module.position_embeddings.data.zero_()
+            if module.mask_token is not None:
+                module.mask_token.data.zero_()
+        elif isinstance(module, FlavaMultimodalModel):
+            if module.use_cls_token:
+                module.cls_token.data.zero_()
+        elif isinstance(module, FlavaModel):
+            module.logit_scale.data.fill_(self.config.logit_scale_init_value)
+
+
+@auto_docstring
+class FlavaImageModel(FlavaPreTrainedModel):
+    config: FlavaImageConfig
+    # This override allows us to load FlavaImageModel from FlavaModel/FlavaForPreTraining checkpoints.
+    base_model_prefix = "flava.image_model"
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: FlavaImageConfig, add_pooling_layer: bool = True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+
+        self.config = config
+
+        self.embeddings = FlavaImageEmbeddings(config)
+        self.encoder = FlavaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = FlavaPooler(config) if add_pooling_layer else None
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.embeddings.patch_embeddings
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.embeddings.patch_embeddings = value
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+        )
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class FlavaTextModel(FlavaPreTrainedModel):
+    config: FlavaTextConfig
+    # This override allows us to load FlavaTextModel from FlavaModel/FlavaForPreTraining checkpoints.
+    base_model_prefix = "flava.text_model"
+
+    def __init__(self, config: FlavaTextConfig, add_pooling_layer: bool = True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = FlavaTextEmbeddings(config)
+        self.encoder = FlavaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = FlavaPooler(config) if add_pooling_layer else None
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> PatchEmbeddings:
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None:
+            raise ValueError("You have to specify input_ids")
+
+        input_shape = input_ids.size()
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=input_ids.device)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, input_ids.device
+        )
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+        )
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class FlavaMultimodalModel(FlavaPreTrainedModel):
+    config: FlavaMultimodalConfig
+    # This override allows us to load FlavaMultimodalModel from FlavaModel/FlavaForPreTraining checkpoints.
+    base_model_prefix = "flava.multimodal_model"
+    main_input_name = "hidden_states"
+
+    def __init__(self, config: FlavaMultimodalConfig, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+        self.use_cls_token = self.config.use_cls_token
+        if self.use_cls_token:
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+
+        self.encoder = FlavaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = FlavaPooler(config) if add_pooling_layer else None
+
+        self.post_init()
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
+            The concatenated hidden states of unimodal encoders.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, seq_length, _ = hidden_states.size()
+
+        if self.use_cls_token:
+            cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+            hidden_states = torch.cat((cls_tokens, hidden_states), dim=1)
+            seq_length += 1
+
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, seq_length), device=hidden_states.device)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, (batch_size, seq_length), hidden_states.device
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring
+class FlavaModel(FlavaPreTrainedModel):
+    config: FlavaConfig
+
+    def __init__(self, config: FlavaConfig):
+        super().__init__(config)
+
+        if not isinstance(config.text_config, FlavaTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type FlavaTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.image_config, FlavaImageConfig):
+            raise TypeError(
+                "config.image_config is expected to be of type FlavaImageConfig but is of type"
+                f" {type(config.image_config)}."
+            )
+
+        if not isinstance(config.multimodal_config, FlavaMultimodalConfig):
+            raise TypeError(
+                "config.multimodal_config is expected to be of type FlavaMultimodalConfig but "
+                + f"is of type {type(config.multimodal_config)}."
+            )
+
+        text_config = config.text_config
+        image_config = config.image_config
+        multimodal_config = config.multimodal_config
+
+        self.projection_dim = config.projection_dim
+        self.text_hidden_size = text_config.hidden_size
+        self.image_hidden_size = image_config.hidden_size
+        self.mm_hidden_size = multimodal_config.hidden_size
+
+        self.text_model = FlavaTextModel(text_config)
+        self.image_model = FlavaImageModel(image_config)
+        self.multimodal_model = FlavaMultimodalModel(multimodal_config)
+
+        self.image_projection = nn.Linear(self.image_hidden_size, self.projection_dim)
+        self.text_projection = nn.Linear(self.text_hidden_size, self.projection_dim)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+        self.image_to_mm_projection = nn.Linear(self.image_hidden_size, self.mm_hidden_size)
+        self.text_to_mm_projection = nn.Linear(self.text_hidden_size, self.mm_hidden_size)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @filter_out_non_signature_kwargs()
+    @auto_docstring
+    def get_text_features(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`FlavaTextModel`].
+
+        Examples:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, FlavaModel
+
+        >>> model = FlavaModel.from_pretrained("{0}")
+        >>> processor = AutoProcessor.from_pretrained("{0}")
+
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
+        ... )
+        >>> with torch.inference_mode():
+        ...     text_features = model.get_text_features(**inputs)
+        ```
+        """
+        text_outputs: BaseModelOutputWithPooling = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+        )
+        pooled_output = text_outputs.last_hidden_state
+        text_features = self.text_projection(pooled_output)
+
+        return text_features
+
+    @filter_out_non_signature_kwargs()
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+
+        Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
+            applying the projection layer to the pooled output of [`FlavaImageModel`].
+
+        Examples:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, FlavaModel
+        >>> from transformers.image_utils import load_image
+
+        >>> model = FlavaModel.from_pretrained("{0}")
+        >>> processor = AutoProcessor.from_pretrained("{0}")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = load_image(url)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> with torch.inference_mode():
+        ...     image_features = model.get_image_features(**inputs)
+        ```
+        """
+        image_outputs: BaseModelOutputWithPooling = self.image_model(
+            pixel_values=pixel_values,
+            bool_masked_pos=bool_masked_pos,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+        pooled_output = image_outputs.last_hidden_state
+        image_features = self.image_projection(pooled_output)
+
+        return image_features
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        skip_multimodal_encoder: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: bool = True,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, FlavaOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*):
+            Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`:
+            - 1 for pixel values that are real (i.e., **not masked**),
+            - 0 for pixel values that are padding (i.e., **masked**).
+        skip_multimodal_encoder (*bool*, *optional*):
+            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, FlavaModel
+
+        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
+        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)
+
+        >>> outputs = model(**inputs)
+
+        >>> image_embeddings = outputs.image_embeddings
+        >>> text_embeddings = outputs.text_embeddings
+        >>> multimodal_embeddings = outputs.multimodal_embeddings
+
+        >>> outputs.image_embeddings.shape
+        torch.Size([1, 197, 768])
+
+        >>> text_embeddings.shape
+        torch.Size([1, 7, 768])
+
+        >>> multimodal_embeddings.shape
+        torch.Size([1, 205, 768])
+        ```
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        if not output_hidden_states:
+            raise ValueError("FLAVA model requires hidden states to work. Please set `output_hidden_states=True`")
+        image_embeddings = None
+        image_states = None
+        image_mm_projection = None
+        image_output = None
+        if pixel_values is not None:
+            image_output = self.image_model(
+                pixel_values=pixel_values,
+                bool_masked_pos=bool_masked_pos,
+                attention_mask=image_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            image_embeddings, image_states = image_output[0], image_output[2]
+            # Note that these states don't use final layernorm in the transformer model
+            image_mm_projection = self.image_to_mm_projection(image_states[-1])
+
+        text_embeddings = None
+        text_states = None
+        text_mm_projection = None
+        text_output = None
+        if input_ids is not None:
+            text_output = self.text_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                token_type_ids=token_type_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+            text_embeddings, text_states = text_output[0], text_output[2]
+            # Note that these states don't use final layernorm in the transformer model
+            text_mm_projection = self.text_to_mm_projection(text_states[-1])
+
+        multimodal_embeddings = None
+        multimodal_output = None
+        if image_mm_projection is not None and text_mm_projection is not None and not skip_multimodal_encoder:
+            if attention_mask is not None:
+                batch_size, seq_len, _ = image_mm_projection.shape
+                if self.multimodal_model.use_cls_token:
+                    seq_len += 1
+                attention_mask_image = torch.ones(batch_size, seq_len, device=image_mm_projection.device)
+                attention_multimodal = torch.cat([attention_mask_image, attention_mask], dim=1)
+            else:
+                attention_multimodal = None
+            multimodal_input = torch.cat([image_mm_projection, text_mm_projection], dim=1)
+            multimodal_output = self.multimodal_model(
+                multimodal_input, attention_mask=attention_multimodal, return_dict=return_dict
+            )
+            multimodal_embeddings = multimodal_output[0]
+
+        if not return_dict:
+            return (
+                image_embeddings,
+                image_output,
+                text_embeddings,
+                text_output,
+                multimodal_embeddings,
+                multimodal_output,
+            )
+
+        return FlavaModelOutput(
+            image_embeddings=image_embeddings,
+            image_output=image_output,
+            text_embeddings=text_embeddings,
+            text_output=text_output,
+            multimodal_embeddings=multimodal_embeddings,
+            multimodal_output=multimodal_output,
+        )
+
+
+class FlavaImageCodebookResPath(nn.Module):
+    """Residual branch of a codebook block: four ReLU -> Conv2d stages run back to back."""
+
+    def __init__(self, in_size: int, out_size: int, **kwargs):
+        super().__init__()
+        # The three 3x3 convs operate at a quarter of the output width; extra keyword arguments are ignored.
+        bottleneck = out_size // 4
+        # Channel counts at the boundaries of the four convs: only the last conv expands to `out_size`.
+        widths = (in_size, bottleneck, bottleneck, bottleneck, out_size)
+        stages = OrderedDict()
+        for idx in range(1, 5):
+            kernel, pad = (1, 0) if idx == 4 else (3, 1)  # the closing conv is a 1x1 projection
+            stages[f"relu_{idx}"] = nn.ReLU()
+            stages[f"conv_{idx}"] = nn.Conv2d(widths[idx - 1], widths[idx], kernel_size=kernel, padding=pad)
+        self.path = nn.Sequential(stages)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Feed `x` through the sequential ReLU/conv stack."""
+        return self.path(x)
+
+
+class FlavaImageCodebookBlock(nn.Module):
+    """Residual block: a shortcut branch plus a down-weighted `FlavaImageCodebookResPath` branch."""
+
+    def __init__(self, in_size: int, out_size: int, num_layers: int, **kwargs):
+        super().__init__()
+        # The residual branch is scaled by the inverse square of `num_layers`.
+        self.post_gain = 1 / (num_layers**2)
+        # A 1x1 conv reconciles the channel count on the shortcut when it changes; otherwise pass `x` through.
+        same_width = in_size == out_size
+        self.id_path = nn.Identity() if same_width else nn.Conv2d(in_size, out_size, kernel_size=1, padding=0)
+        self.res_path = FlavaImageCodebookResPath(in_size, out_size)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shortcut, residual = self.id_path(x), self.res_path(x)
+        return shortcut + self.post_gain * residual
+
+
+class FlavaImageCodebookLayerGroup(nn.Module):
+    """A chain of `num_blocks` codebook blocks, optionally followed by a 2x2 max-pool."""
+
+    def __init__(self, num_blocks: int, num_layers: int, in_size: int, out_size: int, use_pool: bool = True):
+        super().__init__()
+        layers = OrderedDict()
+        width = in_size  # only the first block sees `in_size`; every later block maps `out_size` to itself
+        for number in range(1, num_blocks + 1):
+            layers[f"block_{number}"] = FlavaImageCodebookBlock(width, out_size, num_layers)
+            width = out_size
+        if use_pool:
+            layers["pool"] = nn.MaxPool2d(kernel_size=2)
+        self.group = nn.Sequential(layers)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the blocks (and the pooling layer, when present) in order."""
+        return self.group(x)
+
+
+# Inspired by DALLE Encoder in https://github.com/openai/DALL-E/blob/5be4b236bc3ade6943662354117a0e83752cc322/dall_e/encoder.py#L42
+@auto_docstring(
+    custom_intro="""
+    The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used
+    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
+    `get_codebook_indices` to get image tokens for an image.
+    """
+)
+class FlavaImageCodebook(FlavaPreTrainedModel):
+    base_model_prefix = ""
+    config: FlavaImageCodebookConfig
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = False
+
+    def __init__(self, config: FlavaImageCodebookConfig, **kwargs: Any):
+        super().__init__(config)
+
+        self.config = config
+        self.num_groups = config.num_groups
+        self.input_channels = config.input_channels
+        self.num_blocks_per_group = config.num_blocks_per_group
+        self.hidden_size = config.hidden_size
+        self.vocab_size = config.vocab_size
+
+        # Total residual-block count; every block scales its residual branch by 1 / num_layers**2.
+        num_layers = self.num_groups * self.num_blocks_per_group
+
+        # Head: ReLU plus a 1x1 conv mapping the widest feature map to one logit per codebook entry.
+        output_blocks = OrderedDict()
+        output_blocks["relu"] = nn.ReLU()
+        output_blocks["conv"] = nn.Conv2d(8 * self.hidden_size, self.vocab_size, kernel_size=1, padding=0)
+
+        # Trunk: the channel width doubles in groups 2-4; groups 1-3 each end with a 2x2 max-pool.
+        blocks = OrderedDict()
+        blocks["input"] = nn.Conv2d(self.input_channels, 1 * self.hidden_size, kernel_size=7, padding=3)
+        blocks["group_1"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 1 * self.hidden_size
+        )
+        blocks["group_2"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 1 * self.hidden_size, 2 * self.hidden_size
+        )
+        blocks["group_3"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 2 * self.hidden_size, 4 * self.hidden_size
+        )
+        blocks["group_4"] = FlavaImageCodebookLayerGroup(
+            self.num_blocks_per_group, num_layers, 4 * self.hidden_size, 8 * self.hidden_size, use_pool=False
+        )
+        blocks["output"] = nn.Sequential(output_blocks)
+
+        self.blocks = nn.Sequential(blocks)
+
+        self.post_init()
+
+        # When `config.freeze` is set, disable gradients for every parameter of this model.
+        if self.config.freeze:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def get_codebook_indices(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """Return, for every output position, the index of the codebook entry with the largest logit.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
+                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
+
+        Returns:
+            `torch.Tensor`: The argmax over dim 1 (the vocabulary axis) of the logits produced by `self.blocks`.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoImageProcessor, FlavaImageCodebook
+        >>> model = FlavaImageCodebook.from_pretrained("facebook/flava-image-codebook")
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/flava-image-codebook")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
+        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
+        >>> outputs = model.get_codebook_indices(**inputs)
+        ```
+        """
+        z_logits = self.blocks(pixel_values)
+        return torch.argmax(z_logits, dim=1)
+
+    def get_codebook_probs(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        """Return the codebook logits of `pixel_values` softmax-normalized over dim 1."""
+        return nn.Softmax(dim=1)(self.blocks(pixel_values))
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        """Compute the raw codebook logits for a batch of images.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
+                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.
+
+        Returns:
+            `torch.Tensor`: Logits with `vocab_size` channels on dim 1; height and width are floor-divided by 8.
+
+        Raises:
+            ValueError: If `pixel_values` is not 4-dimensional or does not have `self.input_channels` channels.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoImageProcessor, FlavaImageCodebook
+        >>> model = FlavaImageCodebook.from_pretrained("facebook/flava-image-codebook")
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/flava-image-codebook")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
+        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)
+        >>> outputs = model(**inputs)
+        ```
+        """
+        if len(pixel_values.shape) != 4:
+            raise ValueError(f"input shape {pixel_values.shape} is not 4d")
+        if pixel_values.shape[1] != self.input_channels:
+            raise ValueError(f"input has {pixel_values.shape[1]} channels but model built for {self.input_channels}")
+        return self.blocks(pixel_values)
+
+
+class FlavaPredictionHeadTransform(nn.Module):
+    """Dense projection, activation and LayerNorm applied ahead of a prediction decoder."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        # `hidden_act` is either the name of an activation registered in `ACT2FN` or the callable itself.
+        activation = config.hidden_act
+        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        """Return `LayerNorm(act(dense(hidden_states)))`."""
+        projected = self.transform_act_fn(self.dense(hidden_states))
+        return self.LayerNorm(projected)
+
+
+class FlavaMaskedPredictionHead(nn.Module):
+    """Prediction head: `FlavaPredictionHeadTransform` followed by a linear decoder with `vocab_size` outputs."""
+
+    def __init__(self, config, weight=None):
+        super().__init__()
+        self.config = config
+        self.transform = FlavaPredictionHeadTransform(config)
+        # The decoder is built without a bias; the bias is a separate parameter attached right below.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        if weight is not None:
+            self.decoder.weight = weight  # reuse the caller-supplied weight instead of the fresh one
+        # Share the bias with the decoder so that `resize_token_embeddings` resizes it correctly too
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias  # restore the link between the decoder bias and `self.bias`
+
+    def forward(self, x):
+        return self.decoder(self.transform(x))
+
+
+class FlavaITMHead(nn.Module):
+    """Image-text matching head: pools the input with `FlavaPooler` and projects it to two logits."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pooler = FlavaPooler(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)  # two output logits
+
+    def forward(self, x):
+        return self.seq_relationship(self.pooler(x))
+
+
+class FlavaGlobalContrastiveHead(nn.Module):
+    """Scores local image/text embeddings against the embeddings gathered from every distributed worker."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.global_backprop_contrastive = config.global_backprop_contrastive
+
+    def forward(self, image_embeddings, text_embeddings, logit_scale):
+        """Return `(logits_per_image, logits_per_text, labels)`: each modality's local embeddings scored against
+        the gathered embeddings of the other one and scaled by `exp(logit_scale)`, plus each local example's own row.
+        """
+        temperature = torch.exp(logit_scale)
+        if not torch.distributed.is_available() or not torch.distributed.is_initialized():
+            labels = torch.arange(image_embeddings.size(0), device=image_embeddings.device)
+            image_embeddings_all, text_embeddings_all = [image_embeddings], [text_embeddings]
+        else:
+            local_batch_size = image_embeddings.size(0)
+            world_size = torch.distributed.get_world_size()
+            if self.global_backprop_contrastive:
+                # `torch.distributed.nn.functional.all_gather` does backprop on all active workers
+                # whereas `torch.distributed.all_gather` only backpropagates on the current worker.
+                image_embeddings_all = torch.distributed.nn.functional.all_gather(image_embeddings)
+                text_embeddings_all = torch.distributed.nn.functional.all_gather(text_embeddings)
+            else:
+                # Each receive buffer mirrors the tensor gathered into it (shape, dtype and device).
+                image_embeddings_all = [torch.zeros_like(image_embeddings) for _ in range(world_size)]
+                text_embeddings_all = [torch.zeros_like(text_embeddings) for _ in range(world_size)]
+                torch.distributed.all_gather(image_embeddings_all, image_embeddings)
+                torch.distributed.all_gather(text_embeddings_all, text_embeddings)
+            rank_offset = local_batch_size * torch.distributed.get_rank()
+            labels = rank_offset + torch.arange(local_batch_size, device=image_embeddings.device)
+
+        image_embeddings_all = torch.cat(image_embeddings_all)
+        text_embeddings_all = torch.cat(text_embeddings_all)
+        logits_per_image = torch.matmul(image_embeddings, text_embeddings_all.transpose(0, 1)) * temperature
+        logits_per_text = torch.matmul(text_embeddings, image_embeddings_all.transpose(0, 1)) * temperature
+        return logits_per_image, logits_per_text, labels
+
+
+@auto_docstring(
+    custom_intro="""
+    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
+    """
+)
+class FlavaForPreTraining(FlavaPreTrainedModel):
+    # Those are linked to xxx.bias
+    _tied_weights_keys = [
+        "mmm_text_head.decoder.bias",
+        "mmm_image_head.decoder.bias",
+        "mlm_head.decoder.bias",
+        "mim_head.decoder.bias",
+    ]
+
+    def __init__(self, config: FlavaConfig, image_codebook: Optional[nn.Module] = None):
+        r"""
+        image_codebook ([`nn.Module`]):
+            If passed, the image codebook will be set to this. Otherwise, a [`FlavaImageCodebook`] is built from
+            `config.image_codebook_config` when `config.init_codebook` is truthy; else the codebook stays `None`.
+        """
+        super().__init__(config)
+        self.flava = FlavaModel(config)
+
+        self.image_codebook = image_codebook
+        if self.image_codebook is None and config.init_codebook:
+            self.image_codebook = FlavaImageCodebook(config.image_codebook_config)
+
+        # Leverage the text and image encoder configs to create the masked
+        # prediction heads, since those configs have the right vocab
+        self.mim_head = FlavaMaskedPredictionHead(config.image_config)
+        self.mlm_head = FlavaMaskedPredictionHead(config.text_config)
+        self.itm_head = FlavaITMHead(config)
+        self.mmm_image_head = FlavaMaskedPredictionHead(config.image_config)
+        self.mmm_text_head = FlavaMaskedPredictionHead(config.text_config)
+        self.global_contrastive_head = FlavaGlobalContrastiveHead(config)
+
+        self.image_vocab_size = config.image_config.vocab_size
+        self.text_vocab_size = config.text_config.vocab_size
+        self.mlm_weight = config.mlm_weight
+        self.mim_weight = config.mim_weight
+        self.global_contrastive_weight = config.global_contrastive_weight
+        self.ce_ignore_index = config.ce_ignore_index  # label value that the masked-modeling losses skip
+        self.itm_weight = config.itm_weight
+        self.mmm_image_weight = config.mmm_image_weight
+        self.mmm_text_weight = config.mmm_text_weight
+        self.skip_unmasked_multimodal_encoder = config.skip_unmasked_multimodal_encoder
+
+        self.post_init()
+
+    def _resize_to_2d(self, x: torch.Tensor):
+        """Flatten everything after dim 0 into one axis; tensors with at most two dims are returned as is."""
+        needs_flattening = x.dim() > 2
+        return x.view(x.size(0), -1) if needs_flattening else x
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        input_ids_masked: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        codebook_pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        image_attention_mask: Optional[torch.Tensor] = None,
+        skip_unmasked_multimodal_encoder: Optional[bool] = None,
+        mlm_labels: Optional[torch.Tensor] = None,
+        mim_labels: Optional[torch.Tensor] = None,
+        itm_labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: bool = True,
+        return_dict: Optional[bool] = None,
+        return_loss: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], FlavaForPreTrainingOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        input_ids_masked (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
+            Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task
+            to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
+            [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
+        codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
+            Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        image_attention_mask (`torch.FloatTensor` of shape `(batch_size, image_num_patches)`, *optional*):
+            Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
+            in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        skip_unmasked_multimodal_encoder (*bool*, *optional*):
+            Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
+            multimodal embeddings or outputs as of now.
+        mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
+            Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction).
+            Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
+            indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
+            ..., text_config.vocab_size - 1]`.
+        mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
+            Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
+            image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
+            computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
+            generated automatically using the image codebook assigned to the model. By default, it uses
+            [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.
+        itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
+            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
+            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
+        return_loss (`bool`, *optional*, default to None):
+            Whether to return calculated loss or not.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import FlavaForPreTraining, AutoProcessor
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
+        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")
+
+        >>> text = ["a photo of a cat"]
+
+        >>> inputs = processor(
+        ...     images=[image],
+        ...     text=text,
+        ...     return_masks=True,
+        ...     return_codebook_pixels=True,
+        ...     padding=True,
+        ...     max_length=77,
+        ...     return_tensors="pt",
+        ... )
+
+
+        >>> output = model(**inputs)
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_loss = return_loss if return_loss is not None else self.config.return_loss
+
+        skip_unmasked_multimodal_encoder = (
+            skip_unmasked_multimodal_encoder
+            if skip_unmasked_multimodal_encoder is not None
+            else self.skip_unmasked_multimodal_encoder
+        )
+
+        if input_ids_masked is None and input_ids is not None:
+            logger.warning(
+                "`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to"
+                " `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if"
+                " you are doing inference on unmasked text..."
+            )
+            input_ids_masked = input_ids
+
+        flava_output = self.flava(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            image_attention_mask=image_attention_mask,
+            # Don't need unmasked multimodal embedding for anything so skip it
+            # NOTE: ITM uses masked version
+            skip_multimodal_encoder=skip_unmasked_multimodal_encoder,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            # Pass true to have deterministic outputs
+            return_dict=True,
+        )
+
+        flava_masked_output = self.flava(
+            input_ids=input_ids_masked,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            image_attention_mask=image_attention_mask,
+            bool_masked_pos=bool_masked_pos,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+
+        pos_mask = None
+
+        image_embeddings = flava_output.image_embeddings
+        text_embeddings = flava_output.text_embeddings
+        image_masked_embeddings = flava_masked_output.image_embeddings
+        text_masked_embeddings = flava_masked_output.text_embeddings
+        multimodal_masked_embeddings = flava_masked_output.multimodal_embeddings
+
+        total_loss = mim_loss = mlm_loss = mmm_text_loss = mmm_image_loss = gc_loss = itm_loss = None
+        mim_logits = mlm_logits = mmm_text_logits = mmm_image_logits = None
+        itm_logits = logits_per_image = logits_per_text = None
+
+        # Calculate mim_labels if necessary from the image_codebook
+        if image_masked_embeddings is not None or multimodal_masked_embeddings is not None:
+            if mim_labels is None and return_loss:
+                if self.image_codebook is None:
+                    raise RuntimeError(
+                        "`return_loss` is set to True but the image codebook is not initialized and no `mim_labels` "
+                        " have been passed. Reinstantiate the model with `init_codebook` set to True or "
+                        "pass in your custom `mim_labels`"
+                    )
+                if codebook_pixel_values is None:
+                    raise ValueError(
+                        "`codebook_pixel_value` are required to generate `mim_labels` if loss is expected. "
+                        "Call `AutoProcessor` with `return_codebook_pixels` set to True"
+                    )
+                mim_labels = self.image_codebook.get_codebook_indices(codebook_pixel_values)
+        # Unimodal MIM Loss
+        # If multimodal embeddings are present, we will calculate MMM loss
+        if self.mim_weight > 0 and image_masked_embeddings is not None and multimodal_masked_embeddings is None:
+            sequence_for_image = image_masked_embeddings
+
+            if mim_labels is not None:
+                mim_labels = self._resize_to_2d(mim_labels)
+                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
+                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index
+
+                sequence_for_image = sequence_for_image[:, -mim_labels.size(1) :, :]
+                masked_tokens = mim_labels.ne(self.ce_ignore_index)
+                mim_labels_filtered = mim_labels[masked_tokens]
+                sequence_for_image = sequence_for_image[masked_tokens, :]
+                mim_logits = self.mim_head(sequence_for_image)
+                if return_loss:
+                    mim_loss = nn.functional.cross_entropy(
+                        mim_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
+                    )
+                    mim_loss *= self.mim_weight
+            else:
+                mim_logits = self.mim_head(sequence_for_image)
+
+        # Unimodal MLM Loss
+        if self.mlm_weight > 0 and text_masked_embeddings is not None and multimodal_masked_embeddings is None:
+            sequence_for_text = text_masked_embeddings
+            if mlm_labels is not None:
+                mlm_labels = self._resize_to_2d(mlm_labels)
+                sequence_for_text = sequence_for_text[:, -mlm_labels.size(1) :, :]
+                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
+                mlm_labels_filtered = mlm_labels[masked_tokens]
+                sequence_for_text = sequence_for_text[masked_tokens, :]
+                mlm_logits = self.mlm_head(sequence_for_text)
+                if return_loss:
+                    mlm_loss = nn.functional.cross_entropy(
+                        mlm_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
+                    )
+                    mlm_loss *= self.mlm_weight
+            else:
+                mlm_logits = self.mlm_head(sequence_for_text)
+
+        # ITM Loss
+        if self.itm_weight > 0 and multimodal_masked_embeddings is not None:
+            itm_logits = self.itm_head(multimodal_masked_embeddings)
+
+            if itm_labels is not None:
+                pos_pairs = itm_labels.ne(0)
+                pos_mask = torch.where(pos_pairs.any(), pos_pairs, pos_pairs.new([True]))
+                if return_loss:
+                    itm_loss = nn.functional.cross_entropy(itm_logits, itm_labels)
+                    itm_loss *= self.itm_weight
+
+                if multimodal_masked_embeddings is not None:
+                    multimodal_masked_embeddings = multimodal_masked_embeddings[pos_mask]
+
+                if mlm_labels is not None:
+                    mlm_labels = mlm_labels[pos_mask]
+
+                if mim_labels is not None:
+                    mim_labels = mim_labels[pos_mask]
+                    bool_masked_pos = bool_masked_pos[pos_mask]
+
+        # MMM Image Loss
+        if multimodal_masked_embeddings is not None and self.mmm_image_weight > 0:
+            sequence_for_image = multimodal_masked_embeddings
+            end_index = image_masked_embeddings.size(1) - 1
+            sequence_for_image = sequence_for_image[:, 2 : 2 + end_index, :]
+
+            if mim_labels is not None:
+                mim_labels = self._resize_to_2d(mim_labels)
+                bool_masked_pos = self._resize_to_2d(bool_masked_pos)
+                mim_labels[bool_masked_pos.ne(True)] = self.ce_ignore_index
+
+                masked_tokens = mim_labels.ne(self.ce_ignore_index)
+                mim_labels_filtered = mim_labels[masked_tokens]
+                sequence_for_image = sequence_for_image[masked_tokens, :]
+                mmm_image_logits = self.mmm_image_head(sequence_for_image)
+                if return_loss:
+                    mmm_image_loss = nn.functional.cross_entropy(
+                        mmm_image_logits.view(-1, self.image_vocab_size), mim_labels_filtered.view(-1)
+                    )
+                    mmm_image_loss *= self.mmm_image_weight
+            else:
+                mmm_image_logits = self.mmm_image_head(sequence_for_image)
+
+        # MMM Text Loss
+        if multimodal_masked_embeddings is not None and self.mmm_text_weight > 0:
+            sequence_for_text = multimodal_masked_embeddings
+            sequence_for_text = sequence_for_text[:, -text_masked_embeddings.size(1) :, :]
+
+            if mlm_labels is not None:
+                mlm_labels = self._resize_to_2d(mlm_labels)
+                masked_tokens = mlm_labels.ne(self.ce_ignore_index)
+                mlm_labels_filtered = mlm_labels[masked_tokens]
+                sequence_for_text = sequence_for_text[masked_tokens, :]
+                mmm_text_logits = self.mmm_text_head(sequence_for_text)
+                if return_loss:
+                    mmm_text_loss = nn.functional.cross_entropy(
+                        mmm_text_logits.view(-1, self.text_vocab_size), mlm_labels_filtered.view(-1)
+                    )
+                    mmm_text_loss *= self.mmm_text_weight
+            else:
+                mmm_text_logits = self.mmm_text_head(sequence_for_text)
+
+        # Global Contrastive Loss
+        if image_embeddings is not None and text_embeddings is not None and self.global_contrastive_weight > 0:
+            text_embedding = self.flava.text_projection(text_embeddings[:, 0, :])
+            text_embedding = nn.functional.normalize(text_embedding, dim=-1)
+
+            image_embedding = self.flava.image_projection(image_embeddings[:, 0, :])
+            image_embedding = nn.functional.normalize(image_embedding, dim=-1)
+
+            self.flava.logit_scale.data.clamp_(LOGIT_SCALE_CLAMP_MIN, LOGIT_SCALE_CLAMP_MAX)
+
+            logits_per_image, logits_per_text, gc_labels = self.global_contrastive_head(
+                image_embedding, text_embedding, self.flava.logit_scale
+            )
+
+            # Apply ITM negative mask if any
+            if pos_mask is not None:
+                logits_per_image = logits_per_image[pos_mask]
+                logits_per_text = logits_per_text[pos_mask]
+                gc_labels = gc_labels[pos_mask]
+
+            if return_loss:
+                gc_loss_image = nn.functional.cross_entropy(logits_per_image, gc_labels)
+                gc_loss_text = nn.functional.cross_entropy(logits_per_text, gc_labels)
+                gc_loss = (gc_loss_image + gc_loss_text) / 2
+                gc_loss *= self.global_contrastive_weight
+
+        flava_losses = FlavaLosses(
+            mim=mim_loss,
+            mlm=mlm_loss,
+            itm=itm_loss,
+            global_contrastive=gc_loss,
+            mmm_image=mmm_image_loss,
+            mmm_text=mmm_text_loss,
+        )
+
+        if return_loss and not flava_losses.all_none():
+            total_loss = sum(loss if loss is not None else 0 for loss in flava_losses.values())
+
+        if not return_dict:
+            output = (
+                image_embeddings,
+                flava_output.image_output.to_tuple() if flava_output.image_output is not None else None,
+                text_embeddings,
+                flava_output.text_output.to_tuple() if flava_output.text_output is not None else None,
+                flava_output.multimodal_embeddings,
+                flava_output.multimodal_output.to_tuple() if flava_output.multimodal_output is not None else None,
+                image_masked_embeddings,
+                flava_masked_output.image_output.to_tuple() if flava_masked_output.image_output is not None else None,
+                text_masked_embeddings,
+                flava_masked_output.text_output.to_tuple() if flava_masked_output.text_output is not None else None,
+                multimodal_masked_embeddings,
+                flava_masked_output.multimodal_output.to_tuple()
+                if flava_masked_output.multimodal_output is not None
+                else None,
+                mim_logits,
+                mlm_logits,
+                itm_logits,
+                logits_per_image,
+                logits_per_image,
+                mmm_image_logits,
+                mmm_text_logits,
+            )
+            if return_loss and not flava_losses.all_none():
+                output = (
+                    total_loss,
+                    flava_losses,
+                ) + output
+
+            # Filter None as transformer by default won't handle it
+            return tuple(x for x in output if x is None)
+
+        return FlavaForPreTrainingOutput(
+            loss=total_loss,
+            loss_info=flava_losses,
+            image_embeddings=image_embeddings,
+            image_output=flava_output.image_output,
+            text_embeddings=text_embeddings,
+            text_output=flava_output.text_output,
+            multimodal_embeddings=flava_output.multimodal_embeddings,
+            multimodal_output=flava_output.multimodal_output,
+            image_masked_embeddings=image_masked_embeddings,
+            image_masked_output=flava_masked_output.image_output,
+            text_masked_embeddings=text_masked_embeddings,
+            text_masked_output=flava_masked_output.text_output,
+            multimodal_masked_embeddings=multimodal_masked_embeddings,
+            multimodal_masked_output=flava_masked_output.multimodal_output,
+            mim_logits=mim_logits,
+            mlm_logits=mlm_logits,
+            itm_logits=itm_logits,
+            contrastive_logits_per_image=logits_per_image,
+            contrastive_logits_per_text=logits_per_text,
+            mmm_image_logits=mmm_image_logits,
+            mmm_text_logits=mmm_text_logits,
+        )
+
+
+__all__ = [  # public names of this module (what `from <module> import *` exposes)
+    "FlavaForPreTraining",
+    "FlavaImageCodebook",
+    "FlavaImageModel",
+    "FlavaModel",
+    "FlavaMultimodalModel",
+    "FlavaPreTrainedModel",
+    "FlavaTextModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/processing_flava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/processing_flava.py
new file mode 100644
index 0000000000000000000000000000000000000000..ceebdb6efa49134c3d078a8dd03b08da7c59fb9d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/processing_flava.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2022 Meta Platforms authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image/Text processor class for FLAVA
+"""
+
+import warnings
+from collections.abc import Iterable
+from typing import Optional, Union
+
+from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin
+
+
+class FlavaImagesKwargs(ImagesKwargs):  # image keyword schema used as `FlavaProcessorKwargs.images_kwargs`
+    # Mask related params
+    return_image_mask: Optional[bool]
+    input_size_patches: Optional[int]
+    total_mask_patches: Optional[int]
+    mask_group_min_patches: Optional[int]
+    mask_group_max_patches: Optional[int]
+    mask_group_min_aspect_ratio: Optional[float]
+    mask_group_max_aspect_ratio: Optional[float]
+    # Codebook related params
+    return_codebook_pixels: Optional[bool]
+    codebook_do_resize: Optional[bool]
+    codebook_size: Optional[dict[str, int]]  # a size specification (e.g. height/width), not a boolean flag
+    codebook_resample: Optional[int]
+    codebook_do_center_crop: Optional[bool]
+    codebook_crop_size: Optional[int]
+    codebook_do_rescale: Optional[bool]
+    codebook_rescale_factor: Optional[Union[int, float]]
+    codebook_do_map_pixels: Optional[bool]
+    codebook_do_normalize: Optional[bool]
+    codebook_image_mean: Optional[Union[float, Iterable[float]]]
+    codebook_image_std: Optional[Union[float, Iterable[float]]]
+
+
+class FlavaProcessorKwargs(ProcessingKwargs, total=False):  # set as `FlavaProcessor.valid_processor_kwargs`
+    images_kwargs: FlavaImagesKwargs  # image kwargs use the FLAVA-specific schema (mask + codebook options)
+    _defaults = {}  # no default keyword values are declared at this level
+
+
+class FlavaProcessor(ProcessorMixin):
+    r"""
+    Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor.
+
+    [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the
+    [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input.
+        tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "FlavaImageProcessor"
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
+    valid_processor_kwargs = FlavaProcessorKwargs
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None  # holds the deprecated `feature_extractor` kwarg, if the caller passed one
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+        # An explicitly passed `image_processor` wins; the deprecated alias is only a fallback.
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        super().__init__(image_processor, tokenizer)  # NOTE(review): any remaining **kwargs are not forwarded
+        self.current_processor = self.image_processor
+
+    @property
+    def feature_extractor_class(self):  # deprecated alias of `image_processor_class`; warns on every access
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):  # deprecated alias of `image_processor`; warns on every access
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+__all__ = ["FlavaProcessor"]  # the only public name of this module
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/configuration_fuyu.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/configuration_fuyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..40da84e2e780821f26765333a2cee51030e0bea4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/configuration_fuyu.py
@@ -0,0 +1,215 @@
+# coding=utf-8
+# Copyright 2023 Adept AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fuyu model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class FuyuConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate a
+    Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [adept/fuyu-8b](https://huggingface.co/adept/fuyu-8b).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 262144):
+            Vocabulary size of the Fuyu model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`FuyuForCausalLM`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 16384):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 36):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 16384):
+            The maximum sequence length that this model might ever be used with.
+        image_size (`int`, *optional*, defaults to 300):
+            The input image size.
+        patch_size (`int`, *optional*, defaults to 30):
+            The input vision transformer encoding patch size.
+        num_channels (`int`, *optional*, defaults to 3):
+            The input image number of channels.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie input and output embeddings.
+        rope_theta (`float`, *optional*, defaults to 25000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        qk_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the Queries and Keys after projecting the hidden states
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after applying the MLP to the hidden states.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after computing the attention scores.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            Percentage of the query and keys which will have rotary embedding.
+
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the *beginning-of-sequence* token.
+        eos_token_id (`Union[int, list[int]]`, *optional*, defaults to 2):
+            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+        image_token_id (`int`, *optional*, defaults to 71011):
+            The id of the image placeholder token.
+        text_config (`dict`, *optional*):
+            Options used to build the text sub-config via `CONFIG_MAPPING` (`model_type` defaults to `"persimmon"`).
+
+    ```python
+    >>> from transformers import FuyuConfig
+
+    >>> # Initializing a Fuyu fuyu-8b style configuration
+    >>> configuration = FuyuConfig()
+    ```"""
+
+    model_type = "fuyu"
+    sub_configs = {"text_config": AutoConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=262144,
+        hidden_size=4096,
+        intermediate_size=16384,
+        num_hidden_layers=36,
+        num_attention_heads=64,
+        hidden_act="relu2",
+        max_position_embeddings=16384,
+        image_size=300,
+        patch_size=30,
+        num_channels=3,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=25000.0,
+        rope_scaling=None,
+        qk_layernorm=True,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        partial_rotary_factor=0.5,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        image_token_id=71011,
+        text_config=None,
+        **kwargs,
+    ):
+        if text_config is None:  # no explicit text config: assemble one from the flat arguments
+            text_config = {
+                "vocab_size": vocab_size,
+                "max_position_embeddings": max_position_embeddings,
+                "hidden_size": hidden_size,
+                "intermediate_size": intermediate_size,
+                "num_hidden_layers": num_hidden_layers,
+                "num_attention_heads": num_attention_heads,
+                "hidden_act": hidden_act,
+                "initializer_range": initializer_range,
+                "layer_norm_eps": layer_norm_eps,
+                "use_cache": use_cache,
+                "rope_theta": rope_theta,
+                "rope_scaling": rope_scaling,
+                "qk_layernorm": qk_layernorm,
+                "hidden_dropout": hidden_dropout,
+                "attention_dropout": attention_dropout,
+                "partial_rotary_factor": partial_rotary_factor,
+                "pad_token_id": pad_token_id,
+                "bos_token_id": bos_token_id,
+                "eos_token_id": eos_token_id,
+                "tie_word_embeddings": tie_word_embeddings,
+            }
+            logger.info("text_config is None. initializing the text model with default values.")
+        text_model_type = text_config.get("model_type", "persimmon")  # default sub-config type
+        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
+
+        self._vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.qk_layernorm = qk_layernorm
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.partial_rotary_factor = partial_rotary_factor
+        self.image_token_id = image_token_id
+        self._rope_scaling_validation()  # raises ValueError if `rope_scaling` is malformed
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Raise `ValueError` unless `rope_scaling` is `None` or a two-key dict (`type`: linear/dynamic; float `factor` > 1).
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+
+
+__all__ = ["FuyuConfig"]  # the only public name of this module
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75fd33b864ad4f79952c2cc200e0f7463487ea73
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff4dce97e8d9ca46814f5f7dfa539c9671a66939
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/configuration_gemma3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04b13775c660925697a7772373cc0ae6731ef28b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75af5d891127f73d4623327cd32d7af3da7fb20e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/image_processing_gemma3_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4989362133b183d823f74918ebec4e4bb37f08c8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modeling_gemma3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..413b2dcdaa8a1211d30d2a1551e40fa04d29aa87
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/modular_gemma3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4418a73de64afdc17a2e39be34729d24e937970f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3/__pycache__/processing_gemma3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e20d9814e9f328d81d239db08e26afa1613fabaa
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b2133e5b7c31ede5c7c0e91e4697fd260d1312b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f272c26bec1f425cc7839e2abf5f210fd984c98
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/configuration_git.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..309fa053272dd3904836b34c752337c1a773e9fb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/modeling_git.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/processing_git.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/processing_git.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..689d15179be9eee38378f286ee287d58533ca18a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/__pycache__/processing_git.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c4fe84b6a647aba705f5043964eda1ae9a8803c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bfc2b32688c827912dab312888c9d585e0504c3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/configuration_glm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ffba882e79453821d8398fa810b8fae7fd164c7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modeling_glm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1983d78e86a651cef70e3ae00ddb3054ee972f23
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm/__pycache__/modular_glm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8f5622e7d4ce992e732798d02188d46063bc8df
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd16fb4a5770058abd4d56849a34ad3bbee25b38
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/configuration_glm4_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f633d74e82300345a2eb81b289621a92951b237
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modeling_glm4_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4da69bbbebfc94795056a3d7d7659141c1de374a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4_moe/__pycache__/modular_glm4_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b6ccc53fc0efb0fc88c2f95586276cd40010fe
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static-analysis branch: these star-imports are not executed at runtime
+    from .configuration_got_ocr2 import *
+    from .image_processing_got_ocr2 import *
+    from .image_processing_got_ocr2_fast import *
+    from .modeling_got_ocr2 import *
+    from .processing_got_ocr2 import *
+
+
+else:  # runtime branch: swap this package's `sys.modules` entry for a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # path handed to `define_import_structure` and `_LazyModule` below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb22e62bfd7df1f56b8c5b13ac6fcacb90df6696
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modeling_got_ocr2.py
@@ -0,0 +1,840 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/got_ocr2/modular_got_ocr2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_got_ocr2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import collections
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from transformers.utils.generic import check_model_inputs
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ..auto import AutoModel
+from .configuration_got_ocr2 import GotOcr2Config, GotOcr2VisionConfig
+
+
+class GotOcr2MLPBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.lin1 = nn.Linear(config.hidden_size, config.mlp_dim)
+        self.lin2 = nn.Linear(config.mlp_dim, config.hidden_size)
+        self.act = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.lin1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.lin2(hidden_states)
+        return hidden_states
+
+
+class GotOcr2VisionAttention(nn.Module):
+    """Multi-head Attention block with relative position embeddings."""
+
+    def __init__(self, config, window_size):
+        super().__init__()
+        input_size = (
+            (config.image_size // config.patch_size, config.image_size // config.patch_size)
+            if window_size == 0
+            else (window_size, window_size)
+        )
+
+        self.num_attention_heads = config.num_attention_heads
+        head_dim = config.hidden_size // config.num_attention_heads
+        self.scale = head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.use_rel_pos = config.use_rel_pos
+        if self.use_rel_pos:
+            if input_size is None:
+                raise ValueError("Input size must be provided if using relative positional encoding.")
+
+            # initialize relative positional embeddings
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
+        """
+        Get relative positional embeddings according to the relative positions of
+            query and key sizes.
+
+        Args:
+            q_size (int):
+                size of the query.
+            k_size (int):
+                size of key k.
+            rel_pos (`torch.Tensor`):
+                relative position embeddings (L, channel).
+
+        Returns:
+            Extracted positional embeddings according to relative positions.
+        """
+        max_rel_dist = int(2 * max(q_size, k_size) - 1)
+        # Interpolate rel pos.
+        rel_pos_resized = F.interpolate(
+            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
+            size=max_rel_dist,
+            mode="linear",
+        )
+        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
+
+        # Scale the coords with short length if shapes for q and k are different.
+        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
+        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
+        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
+
+        return rel_pos_resized[relative_coords.long()]
+
+    def get_decomposed_rel_pos(
+        self,
+        query: torch.Tensor,
+        rel_pos_h: torch.Tensor,
+        rel_pos_w: torch.Tensor,
+        q_size: tuple[int, int],
+        k_size: tuple[int, int],
+    ) -> torch.Tensor:
+        """
+        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
+
+        Args:
+            query (`torch.Tensor`):
+                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
+            rel_pos_h (`torch.Tensor`):
+                relative position embeddings (Lh, channel) for height axis.
+            rel_pos_w (`torch.Tensor`):
+                relative position embeddings (Lw, channel) for width axis.
+            q_size (tuple):
+                spatial sequence size of query q with (query_height, query_width).
+            k_size (tuple):
+                spatial sequence size of key k with (key_height, key_width).
+
+        Returns:
+            decomposed_rel_pos (`torch.Tensor`):
+                decomposed relative position embeddings.
+        """
+        query_height, query_width = q_size
+        key_height, key_width = k_size
+        relative_position_height = self.get_rel_pos(query_height, key_height, rel_pos_h)
+        relative_position_width = self.get_rel_pos(query_width, key_width, rel_pos_w)
+
+        batch_size, _, dim = query.shape
+        reshaped_query = query.reshape(batch_size, query_height, query_width, dim)
+        rel_h = torch.einsum("bhwc,hkc->bhwk", reshaped_query, relative_position_height)
+        rel_w = torch.einsum("bhwc,wkc->bhwk", reshaped_query, relative_position_width)
+
+        decomposed_rel_pos = rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
+
+        return decomposed_rel_pos
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions=None) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, height, width, _ = hidden_states.shape
+        # qkv with shape (3, batch_size, nHead, height * width, channel)
+        qkv = (
+            self.qkv(hidden_states)
+            .reshape(batch_size, height * width, 3, self.num_attention_heads, -1)
+            .permute(2, 0, 3, 1, 4)
+        )
+        # q, k, v with shape (batch_size * nHead, height * width, channel)
+        query, key, value = qkv.reshape(3, batch_size * self.num_attention_heads, height * width, -1).unbind(0)
+
+        attn_weights = (query * self.scale) @ key.transpose(-2, -1)
+
+        if self.use_rel_pos:
+            decomposed_rel_pos = self.get_decomposed_rel_pos(
+                query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
+            )
+            decomposed_rel_pos = decomposed_rel_pos.reshape_as(attn_weights)
+            attn_weights = attn_weights + decomposed_rel_pos
+
+        attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
+        attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
+
+        attn_output = self.proj(attn_output)
+        return attn_output, attn_weights
+
+
+class GotOcr2VisionLayer(GradientCheckpointingLayer):
+    def __init__(self, config, window_size):
+        super().__init__()
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attn = GotOcr2VisionAttention(config, window_size)
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = GotOcr2MLPBlock(config)
+        self.window_size = window_size
+
+    def window_partition(self, hidden_states: torch.Tensor, window_size: int) -> tuple[torch.Tensor, tuple[int, int]]:
+        """
+        Args:
+        Partition into non-overlapping windows with padding if needed.
+            hidden_states (tensor): input tokens with [batch_size, height, width, channel]. window_size (int): window
+            size.
+
+        Returns:
+            windows: windows after partition with [batch_size * num_windows, window_size, window_size, channel].
+            (pad_height, pad_width): padded height and width before partition
+        """
+        batch_size, height, width, channel = hidden_states.shape
+
+        pad_h = (window_size - height % window_size) % window_size
+        pad_w = (window_size - width % window_size) % window_size
+        hidden_states = F.pad(hidden_states, (0, 0, 0, pad_w, 0, pad_h))
+        pad_height, pad_width = height + pad_h, width + pad_w
+
+        hidden_states = hidden_states.reshape(
+            batch_size, pad_height // window_size, window_size, pad_width // window_size, window_size, channel
+        )
+        windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(-1, window_size, window_size, channel)
+        return windows, (pad_height, pad_width)
+
+    def window_unpartition(
+        self, windows: torch.Tensor, window_size: int, padding_shape: tuple[int, int], original_shape: tuple[int, int]
+    ) -> torch.Tensor:
+        """
+        Args:
+        Window unpartition into original sequences and removing padding.
+            hidden_states (tensor):
+                input tokens with [batch_size * num_windows, window_size, window_size, channel].
+            window_size (int):
+                window size.
+            padding_shape (Tuple):
+                padded height and width (pad_height, pad_width).
+            original_shape (Tuple): original height and width (height, width) before padding.
+
+        Returns:
+            hidden_states: unpartitioned sequences with [batch_size, height, width, channel].
+        """
+        pad_height, pad_width = padding_shape
+        height, width = original_shape
+        batch_size = windows.shape[0] // (pad_height * pad_width // window_size // window_size)
+        hidden_states = windows.reshape(
+            batch_size, pad_height // window_size, pad_width // window_size, window_size, window_size, -1
+        )
+        hidden_states = (
+            hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().reshape(batch_size, pad_height, pad_width, -1)
+        )
+
+        hidden_states = hidden_states[:, :height, :width, :].contiguous()
+        return hidden_states
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.FloatTensor]:
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        # Window partition
+        if self.window_size > 0:
+            height, width = hidden_states.shape[1], hidden_states.shape[2]
+            hidden_states, padding_shape = self.window_partition(hidden_states, self.window_size)
+
+        hidden_states, attn_weights = self.attn(
+            hidden_states=hidden_states,
+        )
+        # Reverse window partition
+        if self.window_size > 0:
+            hidden_states = self.window_unpartition(hidden_states, self.window_size, padding_shape, (height, width))
+
+        hidden_states = residual + hidden_states
+        layernorm_output = self.layer_norm2(hidden_states)
+        hidden_states = hidden_states + self.mlp(layernorm_output)
+        return hidden_states
+
+
+@auto_docstring
+class GotOcr2PreTrainedModel(PreTrainedModel):
+    config: GotOcr2Config
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = False
+    _supports_sdpa = False
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = False
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, GotOcr2VisionAttention):
+            if module.use_rel_pos:
+                module.rel_pos_h.data.zero_()
+                module.rel_pos_w.data.zero_()
+        elif isinstance(module, GotOcr2VisionEncoder):
+            if module.pos_embed is not None:
+                module.pos_embed.data.zero_()
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for got_ocr2 vision model's outputs that also contains image embeddings obtained by applying the projection
+    layer to the pooler_output.
+    """
+)
+class GotOcr2VisionEncoderOutput(ModelOutput):
+    r"""
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    """
+
+    image_embeds: Optional[torch.FloatTensor] = None  # not assigned by `GotOcr2VisionEncoder.forward` in this file
+    last_hidden_state: Optional[torch.FloatTensor] = None  # `GotOcr2VisionEncoder.forward` stores the neck output here
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None  # NOTE(review): not set in forward's return; presumably recorded via `_can_record_outputs` - confirm
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None  # NOTE(review): same as `hidden_states` - confirm
+
+
+class GotOcr2PatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values):
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if height != self.image_size[0] or width != self.image_size[1]:
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+            )
+        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
+        return embeddings
+
+
+class GotOcr2LayerNorm(nn.LayerNorm):
+    r"""LayerNorm that supports two data formats: channels_last (default) or channels_first.
+    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
+    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
+    """
+
+    def __init__(self, normalized_shape, *, eps=1e-6, data_format="channels_last", **kwargs):
+        super().__init__(normalized_shape, eps=eps, **kwargs)
+        if data_format not in ["channels_last", "channels_first"]:
+            raise NotImplementedError(f"Unsupported data format: {data_format}")
+        self.data_format = data_format
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            features: Tensor of shape (batch_size, channels, height, width) OR (batch_size, height, width, channels)
+        """
+        if self.data_format == "channels_first":
+            features = features.permute(0, 2, 3, 1)
+            features = super().forward(features)
+            features = features.permute(0, 3, 1, 2)
+        else:
+            features = super().forward(features)
+        return features
+
+
+class GotOcr2VisionNeck(nn.Module):
+    def __init__(self, config: GotOcr2VisionConfig):
+        super().__init__()
+        self.config = config
+
+        self.conv1 = nn.Conv2d(config.hidden_size, config.output_channels, kernel_size=1, bias=False)
+        self.layer_norm1 = GotOcr2LayerNorm(config.output_channels, data_format="channels_first")
+        self.conv2 = nn.Conv2d(config.output_channels, config.output_channels, kernel_size=3, padding=1, bias=False)
+        self.layer_norm2 = GotOcr2LayerNorm(config.output_channels, data_format="channels_first")
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 3, 1, 2)
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.layer_norm1(hidden_states)
+
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.layer_norm2(hidden_states)
+        return hidden_states
+
+
+class GotOcr2VisionEncoder(GotOcr2PreTrainedModel):
+    _can_record_outputs = {"hidden_states": GotOcr2VisionLayer, "attentions": GotOcr2VisionAttention}
+
+    def __init__(self, config: GotOcr2VisionConfig):
+        super().__init__(config)
+        self.config = config
+        self.image_size = config.image_size
+        self.patch_embed = GotOcr2PatchEmbeddings(config)
+
+        self.pos_embed = None
+        if config.use_abs_pos:
+            # Initialize absolute positional embedding with pretrain image size.
+            self.pos_embed = nn.Parameter(
+                torch.zeros(
+                    1,
+                    config.image_size // config.patch_size,
+                    config.image_size // config.patch_size,
+                    config.hidden_size,
+                )
+            )
+
+        self.layers = nn.ModuleList()
+        for i in range(config.num_hidden_layers):
+            layer = GotOcr2VisionLayer(
+                config,
+                window_size=config.window_size if i not in config.global_attn_indexes else 0,
+            )
+            self.layers.append(layer)
+
+        self.neck = GotOcr2VisionNeck(config)
+
+        self.gradient_checkpointing = False
+
+    def get_input_embeddings(self):
+        return self.patch_embed
+
+    @check_model_inputs(tie_last_hidden_states=False)
+    def forward(
+        self, pixel_values: Optional[torch.FloatTensor] = None, **kwargs: Unpack[TransformersKwargs]
+    ) -> GotOcr2VisionEncoderOutput:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.patch_embed(pixel_values)
+        if self.pos_embed is not None:
+            hidden_states = hidden_states + self.pos_embed
+        for layer_module in self.layers:
+            hidden_states = layer_module(hidden_states)
+        hidden_states = self.neck(hidden_states)
+        return GotOcr2VisionEncoderOutput(
+            last_hidden_state=hidden_states,
+        )
+
+
+class GotOcr2MultiModalProjector(nn.Module):
+    def __init__(self, config: GotOcr2Config):
+        super().__init__()
+        vision_output_channels = config.vision_config.output_channels
+        language_hidden_size = config.text_config.hidden_size
+        self.conv_upsampler1 = nn.Conv2d(
+            vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.conv_upsampler2 = nn.Conv2d(
+            vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size)
+
+    def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv_upsampler1(vision_embeddings)
+        hidden_state = self.conv_upsampler2(hidden_state)
+        hidden_state = hidden_state.flatten(2).permute(0, 2, 1)
+        hidden_state = self.multimodal_projector(hidden_state)
+        return hidden_state
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for GotOcr2 causal language model (or autoregressive) outputs.
+    """
+)
+class GotOcr2CausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None  # NOTE(review): presumably copied from the base model outputs - confirm
+    attentions: Optional[tuple[torch.FloatTensor]] = None  # NOTE(review): presumably copied from the base model outputs - confirm
+    image_hidden_states: Optional[torch.FloatTensor] = None  # NOTE(review): docstring gives a 4-D size, but `GotOcr2MultiModalProjector` returns 3-D features - confirm
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for GotOcr2 outputs, with hidden states and attentions.
+    """
+)
+class GotOcr2ModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(num_images, image_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    image_hidden_states: Optional[torch.FloatTensor] = None  # `GotOcr2Model.forward` stores the projected image features here
+
+
+@auto_docstring(
+    custom_intro="""
+    The GotOcr2 model which consists of a vision backbone and a language model, without a language modeling head.
+    """
+)
+class GotOcr2Model(GotOcr2PreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+
+    def __init__(self, config: GotOcr2Config):
+        super().__init__(config)
+        self.vision_tower = GotOcr2VisionEncoder(config.vision_config)
+
+        self.multi_modal_projector = GotOcr2MultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        image_outputs = self.vision_tower(pixel_values).last_hidden_state
+        return self.multi_modal_projector(image_outputs)
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:  # no token ids: compare every embedding with the image-token embedding instead
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)  # a position matches only if all its features match
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():  # element counts must agree exactly
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, GotOcr2ModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # NOTE(review): not read again in this method
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)  # fill placeholder positions with image features
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,  # the fields of `outputs` are read by attribute below
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return GotOcr2ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,  # only bound when pixel_values was given
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GOT_OCR2 model which consists of a vision backbone and a language model.
+    """
+)
+class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GotOcr2Config):
+        super().__init__(config)
+        self.model = GotOcr2Model(config)  # vision tower + projector + language model
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)  # hidden size -> vocab size
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()  # delegates to the wrapped `GotOcr2Model`
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)  # delegates to the wrapped `GotOcr2Model`
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head  # the bias-free linear head created in `__init__`
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GotOcr2ForConditionalGeneration, TextStreamer
+
+        >>> model = GotOcr2ForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda")
+        >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+
+        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda")
+
+        >>> # Generate
+        >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+        >>> generate_ids = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     tokenizer = processor.tokenizer,
+        ...     stop_strings='<|im_end|>',
+        ...     streamer=streamer,
+        ...     max_new_tokens=4096,
+        ... )
+        "You should keep in mind what features from the module should be used, especially
+        when you're planning to sell a template."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return GotOcr2CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+
+__all__ = ["GotOcr2PreTrainedModel", "GotOcr2Model", "GotOcr2ForConditionalGeneration"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ecf39fcd03b577b406868a639d3ce8ee9425e3d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/modular_got_ocr2.py
@@ -0,0 +1,483 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...cache_utils import Cache
+from ...configuration_utils import PretrainedConfig
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+from ..llava.modeling_llava import (
+    LlavaCausalLMOutputWithPast,
+    LlavaForConditionalGeneration,
+    LlavaModel,
+    LlavaModelOutputWithPast,
+    LlavaPreTrainedModel,
+    TransformersKwargs,
+)
+from ..sam.modeling_sam import (
+    SamMLPBlock,
+    SamPreTrainedModel,
+    SamVisionAttention,
+    SamVisionEncoder,
+    SamVisionLayer,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class GotOcr2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GotOcr2VisionModel`]. It is used to instantiate a GOT_OCR2
+    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the SAM ViT-h
+    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        output_channels (`int`, *optional*, defaults to 256):
+            Dimensionality of the output channels in the Patch Encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input image.
+        image_size (`int`, *optional*, defaults to 1024):
+            Expected resolution. Target size of the resized input image.
+        patch_size (`int`, *optional*, defaults to 16):
+            Size of the patches to be extracted from the input image.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string)
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 1e-10):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to query, key, value projections.
+        use_abs_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use absolute position embedding.
+        use_rel_pos (`bool`, *optional*, defaults to `True`):
+            Whether to use relative position embedding.
+        window_size (`int`, *optional*, defaults to 14):
+            Window size for relative position.
+        global_attn_indexes (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
+            The indexes of the global attention layers.
+        mlp_dim (`int`, *optional*, defaults to 3072):
+            The dimensionality of the MLP layer in the Transformer encoder.
+    """
+
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        output_channels=256,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=1024,
+        patch_size=16,
+        hidden_act="gelu",
+        layer_norm_eps=1e-06,
+        attention_dropout=0.0,
+        initializer_range=1e-10,
+        qkv_bias=True,
+        use_abs_pos=True,
+        use_rel_pos=True,
+        window_size=14,
+        global_attn_indexes=[2, 5, 8, 11],
+        mlp_dim=3072,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.output_channels = output_channels
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+        self.initializer_range = initializer_range
+        self.qkv_bias = qkv_bias
+        self.use_abs_pos = use_abs_pos
+        self.use_rel_pos = use_rel_pos
+        self.window_size = window_size
+        self.global_attn_indexes = global_attn_indexes
+        self.mlp_dim = mlp_dim
+
+
+class GotOcr2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GotOcr2ForConditionalGeneration`]. It is used to instantiate a
+    GotOcr2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of GOT-OCR-2.0.
+
+    e.g. [stepfun-ai/GOT-OCR-2.0-hf](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vision_config (`Union[GotOcr2VisionConfig, dict]`, *optional*, defaults to `GotOcr2VisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
+            The config object or dictionary of the text backbone.
+        image_token_index (`int`, *optional*, defaults to 151859):
+            The image token index to encode the image prompt.
+        image_seq_length (`int`, *optional*, defaults to 576):
+            Sequence length of one image embedding.
+        pad_token_id (`int`, *optional*, defaults to -1):
+            Padding token id.
+
+    ```python
+    >>> from transformers import GotOcr2ForConditionalGeneration, GotOcr2Config
+
+    >>> # Initializing a GotOcr2 style configuration
+    >>> configuration = GotOcr2Config()
+
+    >>> # Initializing a model from the GotOcr2 style configuration
+    >>> model = GotOcr2ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "got_ocr2"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
+    sub_configs = {"text_config": AutoConfig, "vision_config": GotOcr2VisionConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_index=151859,
+        image_seq_length=576,
+        pad_token_id=-1,
+        **kwargs,
+    ):
+        self.image_token_index = image_token_index
+        self.image_seq_length = image_seq_length
+        self.pad_token_id = pad_token_id
+
+        if vision_config is None:
+            self.vision_config = GotOcr2VisionConfig()
+        elif isinstance(vision_config, dict):
+            self.vision_config = GotOcr2VisionConfig(**vision_config)
+        elif isinstance(vision_config, GotOcr2VisionConfig):
+            self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "qwen2")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["qwen2"](
+                vocab_size=151860,
+                hidden_size=1024,
+                intermediate_size=2816,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                num_key_value_heads=16,
+                hidden_act="silu",
+                max_position_embeddings=32768,
+                initializer_range=0.02,
+                rms_norm_eps=1e-6,
+                use_cache=True,
+                tie_word_embeddings=True,
+                rope_theta=1000000.0,
+                rope_scaling=None,
+                use_sliding_window=False,
+                sliding_window=4096,
+                max_window_layers=21,
+                attention_dropout=0.0,
+            )
+
+        self.text_config = text_config
+
+        super().__init__(**kwargs)
+
+
+class GotOcr2MLPBlock(SamMLPBlock):
+    pass
+
+
+class GotOcr2VisionAttention(SamVisionAttention):
+    pass
+
+
+class GotOcr2VisionLayer(SamVisionLayer):
+    def __init__(self, config, window_size):
+        super().__init__(config, window_size)
+        self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attn = GotOcr2VisionAttention(config, window_size)
+        self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.mlp = GotOcr2MLPBlock(config)
+        self.window_size = window_size
+
+
+class GotOcr2PreTrainedModel(SamPreTrainedModel):
+    pass
+
+
+class GotOcr2VisionEncoder(SamVisionEncoder, GotOcr2PreTrainedModel):
+    pass
+
+
+class GotOcr2MultiModalProjector(nn.Module):
+    def __init__(self, config: GotOcr2Config):
+        super().__init__()
+        vision_output_channels = config.vision_config.output_channels
+        language_hidden_size = config.text_config.hidden_size
+        self.conv_upsampler1 = nn.Conv2d(
+            vision_output_channels, vision_output_channels * 2, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.conv_upsampler2 = nn.Conv2d(
+            vision_output_channels * 2, language_hidden_size, kernel_size=3, stride=2, padding=1, bias=False
+        )
+        self.multimodal_projector = nn.Linear(language_hidden_size, language_hidden_size)
+
+    def forward(self, vision_embeddings: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.conv_upsampler1(vision_embeddings)
+        hidden_state = self.conv_upsampler2(hidden_state)
+        hidden_state = hidden_state.flatten(2).permute(0, 2, 1)
+        hidden_state = self.multimodal_projector(hidden_state)
+        return hidden_state
+
+
+class GotOcr2CausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
+    pass
+
+
+class GotOcr2ModelOutputWithPast(LlavaModelOutputWithPast):
+    pass
+
+
+class GotOcr2PreTrainedModel(LlavaPreTrainedModel):
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_flex_attn = False
+
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, GotOcr2VisionAttention):
+            if module.use_rel_pos:
+                module.rel_pos_h.data.zero_()
+                module.rel_pos_w.data.zero_()
+        elif isinstance(module, GotOcr2VisionEncoder):
+            if module.pos_embed is not None:
+                module.pos_embed.data.zero_()
+
+
+class GotOcr2Model(LlavaModel):
+    def __init__(self, config: GotOcr2Config):
+        super().__init__(config)
+        self.vision_tower = GotOcr2VisionEncoder(config.vision_config)
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and applies the multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
+        """
+        image_outputs = self.vision_tower(pixel_values).last_hidden_state
+        return self.multi_modal_projector(image_outputs)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, GotOcr2ModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values=pixel_values.to(inputs_embeds.dtype))
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return GotOcr2ModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+class GotOcr2ForConditionalGeneration(LlavaForConditionalGeneration):
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, GotOcr2CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GotOcr2ForConditionalGeneration, TextStreamer
+
+        >>> model = GotOcr2ForConditionalGeneration.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf").to("cuda")
+        >>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
+
+        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(image, return_tensors="pt", color="green").to("cuda")
+
+        >>> # Generate
+        >>> streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
+        >>> generate_ids = model.generate(
+        ...     **inputs,
+        ...     do_sample=False,
+        ...     tokenizer = processor.tokenizer,
+        ...     stop_strings='<|im_end|>',
+        ...     streamer=streamer,
+        ...     max_new_tokens=4096,
+        ... )
+        "You should keep in mind what features from the module should be used, especially
+        when you're planning to sell a template."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return GotOcr2CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+
+__all__ = [
+    "GotOcr2VisionConfig",
+    "GotOcr2Config",
+    "GotOcr2PreTrainedModel",
+    "GotOcr2Model",
+    "GotOcr2ForConditionalGeneration",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py
new file mode 100644
index 0000000000000000000000000000000000000000..16c062ec63ade1310971f8797291439b59bad5ac
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/processing_got_ocr2.py
@@ -0,0 +1,261 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Union
+
+import numpy as np
+
+from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...utils import is_vision_available, logging
+
+
+if is_vision_available():
+    from ...image_utils import load_images
+
+logger = logging.get_logger(__name__)
+
+
+class GotOcr2TextKwargs(TextKwargs, total=False):  # text kwargs plus one GOT-OCR2-specific flag
+    format: Optional[bool]  # when truthy, " with format" is added to the auto-generated OCR query
+
+
+class GotOcr2ImagesKwargs(ImagesKwargs, total=False):  # image-side options read by GotOcr2Processor.__call__
+    box: Optional[Union[list, tuple[float, float], tuple[float, float, float, float]]]  # box(es) for the query
+    color: Optional[str]  # emitted as "[color] " at the start of the auto-generated query
+    num_image_tokens: Optional[int]  # number of "<imgpad>" placeholders written per image patch
+    multi_page: Optional[bool]  # treat each inner list of images as the pages of one sample
+    crop_to_patches: Optional[bool]  # left in images_kwargs, so it is also passed to the image processor
+    min_patches: Optional[int]  # passed through to the image processor call
+    max_patches: Optional[int]  # passed through to the image processor call
+
+
+class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):  # kwargs schema given to `_merge_kwargs` in __call__
+    text_kwargs: GotOcr2TextKwargs
+    images_kwargs: GotOcr2ImagesKwargs
+    _defaults = {  # NOTE(review): presumably the fallback values `_merge_kwargs` uses for omitted kwargs - confirm
+        "text_kwargs": {
+            "padding": False,
+            "format": False,  # plain (unformatted) OCR query by default
+        },
+        "images_kwargs": {  # no "box"/"color" entries: __call__ supplies its own fallbacks when popping them
+            "num_image_tokens": 256,
+            "multi_page": False,
+            "crop_to_patches": False,
+            "min_patches": 1,
+            "max_patches": 12,
+        },
+    }
+
+
+def preprocess_box_annotation(box: Union[list, tuple], image_size: tuple[int, int]) -> list:
+    """
+    Convert box annotation to the format [x1, y1, x2, y2] in the range [0, 1000].
+    """
+    width, height = image_size
+    if len(box) == 4:
+        box[0] = int(box[0] / width * 1000)
+        box[1] = int(box[1] / height * 1000)
+        box[2] = int(box[2] / width * 1000)
+        box[3] = int(box[3] / height * 1000)
+    else:
+        raise ValueError("Box must be a list or tuple of lists in the form [x1, y1, x2, y2].")
+
+    return list(box)
+
+
+class GotOcr2Processor(ProcessorMixin):
+    r"""
+    Constructs a GotOcr2 processor which wraps a [`GotOcr2ImageProcessor`] and
+    [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
+    tokenizer functionalities. See the [`~GotOcr2Processor.__call__`] and [`~GotOcr2Processor.decode`] for more information.
+    Args:
+        image_processor ([`GotOcr2ImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "PreTrainedTokenizerFast"
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+        self.message_start_token = "<|im_start|>"
+        self.message_end_token = "<|im_end|>"
+        self.img_start_token = "<img>"
+        self.img_end_token = "</img>"
+        self.img_pad_token = "<imgpad>"
+        self.image_token = "<imgpad>"  # keep the above for BC, but we need to call it `image_token`
+        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
+
+    def _make_list_of_inputs(self, images, text, box, color, multi_page):
+        if not isinstance(images, (list, tuple)):
+            images = [images]
+            if multi_page:
+                logger.warning("Multi-page inference is enabled but only one image is passed.")
+                images = [images]
+        elif isinstance(images[0], (list, tuple)) and not multi_page:
+            raise ValueError("Nested images are only supported with `multi_page` set to `True`.")
+        elif not isinstance(images[0], (list, tuple)) and multi_page:
+            images = [images]
+
+        if isinstance(text, str):
+            text = [text]
+
+        if not isinstance(box[0], (list, tuple)):
+            # Use the same box for all images
+            box = [box for _ in range(len(images))]
+        if not isinstance(color, (list, tuple)):
+            color = [color for _ in range(len(images))]
+
+        return images, text, box, color
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[GotOcr2ProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
+        is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
+        `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
+        GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            format (`bool`, *optional*):
+                If set, will add the format token to the query, and the model will return the OCR result with formatting.
+            box (`list[float]`, `list[tuple[float, float]]`, `list[tuple[float, float, float, float]]`, *optional*):
+                The box annotation to be added to the query. If a list of floats or a tuple of floats is provided, it
+                will be interpreted as [x1, y1, x2, y2]. If a list of tuples is provided, each tuple should be in the
+                form (x1, y1, x2, y2).
+            color (`str`, *optional*):
+                The color annotation to be added to the query. The model will return the OCR result within the box with
+                the specified color.
+            multi_page (`bool`, *optional*):
+                If set, will enable multi-page inference. The model will return the OCR result across multiple pages.
+            crop_to_patches (`bool`, *optional*):
+                If set, will crop the image to patches. The model will return the OCR result upon the patch reference.
+            min_patches (`int`, *optional*):
+                The minimum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to
+                `True`.
+            max_patches (`int`, *optional*):
+                The maximum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to
+                `True`.
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+
+        output_kwargs = self._merge_kwargs(
+            GotOcr2ProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        format_output = output_kwargs["text_kwargs"].pop("format")
+        num_image_tokens = output_kwargs["images_kwargs"].pop("num_image_tokens")
+        box = output_kwargs["images_kwargs"].pop("box", [None])
+        color = output_kwargs["images_kwargs"].pop("color", None)
+        multi_page = output_kwargs["images_kwargs"].pop("multi_page")
+
+        crop_to_patches = output_kwargs["images_kwargs"].get("crop_to_patches")
+        images, text, box, color = self._make_list_of_inputs(images, text, box, color, multi_page)
+        if multi_page:
+            # save the number of pages per batch
+            num_pages_per_batch = [len(image_group) for image_group in images]
+            # flatten the list of images
+            images = [image for image_group in images for image in image_group]
+        else:
+            num_pages_per_batch = [1 for _ in range(len(images))]
+        # Load images as we need to know the image size
+        images = load_images(images)
+        image_sizes = [image.size for image in images]
+        image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+        num_patches_array = image_inputs.pop("num_patches")
+        if text is None:
+            text = []
+            patch_indices = np.cumsum(num_pages_per_batch)
+            for index, (num_pages, box_single, color_single) in enumerate(zip(num_pages_per_batch, box, color)):
+                current_patch_index = patch_indices[index - 1] if index > 0 else 0
+                num_patches = sum(num_patches_array[current_patch_index : current_patch_index + num_pages])
+                if box_single[0] is not None:
+                    box_single = preprocess_box_annotation(box_single, image_sizes[index])
+                query = (
+                    f"{f'[{color_single}] ' if color_single is not None else ''}"
+                    f"{str(box_single) if box_single[0] is not None else ''} "
+                    "OCR"
+                    f"{' with format' if format_output else ''}"
+                    f"{' across multi pages' if multi_page else ''}"
+                    f"{' upon the patch reference' if crop_to_patches else ''}"
+                    ": "
+                )
+                prompt = (
+                    self.message_start_token
+                    + self.system_query
+                    + self.message_end_token
+                    + self.message_start_token
+                    + "user\n"
+                    + self.img_start_token
+                    + self.img_pad_token * num_image_tokens * num_patches
+                    + self.img_end_token
+                    + "\n"
+                    + query
+                    + self.message_end_token
+                    + self.message_start_token
+                    + "assistant\n"
+                )
+                text.append(prompt)
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+
+__all__ = ["GotOcr2Processor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f01899e668e3a86548db3f59c7f42d70746385ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__init__.py
@@ -0,0 +1,32 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static analysis only: expose every submodule's names through star-imports
+    from .configuration_gpt2 import *
+    from .modeling_flax_gpt2 import *
+    from .modeling_gpt2 import *
+    from .modeling_tf_gpt2 import *
+    from .tokenization_gpt2 import *
+    from .tokenization_gpt2_fast import *
+    from .tokenization_gpt2_tf import *
+else:  # at runtime: replace this module's `sys.modules` entry with a `_LazyModule` built from this file
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/configuration_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/configuration_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..db5151a2ba15635a7943744799b0689fc96790d3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/configuration_gpt2.py
@@ -0,0 +1,274 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OpenAI GPT-2 configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PreTrainedTokenizer, TensorType, is_torch_available
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfigWithPast, PatchingSpec
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPT2Config(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`GPT2Model`] or a [`TFGPT2Model`]. It is used to
+    instantiate a GPT-2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GPT-2
+    [openai-community/gpt2](https://huggingface.co/openai-community/gpt2) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 768):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        summary_type (`string`, *optional*, defaults to `"cls_index"`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            Has to be one of the following options:
+
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            Whether or not to add a projection after the vector extraction.
+        summary_activation (`str`, *optional*):
+            Argument used when doing sequence summary. Used for the multiple choice head in
+            [`GPT2DoubleHeadsModel`].
+
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`] and
+            [`TFGPT2DoubleHeadsModel`].
+
+            The dropout ratio to be used after the projection and activation.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            Id of the beginning of sentence token in the vocabulary.
+        eos_token_id (`int`, *optional*, defaults to 50256):
+            Id of the end of sentence token in the vocabulary.
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+
+    Example:
+
+    ```python
+    >>> from transformers import GPT2Config, GPT2Model
+
+    >>> # Initializing a GPT2 configuration
+    >>> configuration = GPT2Config()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = GPT2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gpt2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {  # NOTE(review): looks like generic-name -> GPT-2-name aliases for the base class - confirm
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50257,
+        n_positions=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        summary_type="cls_index",
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size  # each argument is stored unchanged under its own name
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)  # ids also sent to the base
+
+
+class GPT2OnnxConfig(OnnxConfigWithPast):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "default",
+        patching_specs: Optional[list[PatchingSpec]] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+        if not getattr(self._config, "pad_token_id", None):
+            # TODO: how to do that better?
+            self._config.pad_token_id = 0
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction="inputs")
+            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        else:
+            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
+
+        return common_inputs
+
+    @property
+    def num_layers(self) -> int:
+        return self._config.n_layer
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self._config.n_head
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+
+        # We need to order the input in the way they appears in the forward()
+        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
+
+        # Need to add the past_keys
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+
+                batch, seqlen = common_inputs["input_ids"].shape
+                # Not using the same length for past_key_values
+                past_key_values_length = seqlen + 2
+                past_shape = (
+                    batch,
+                    self.num_attention_heads,
+                    past_key_values_length,
+                    self._config.hidden_size // self.num_attention_heads,
+                )
+                ordered_inputs["past_key_values"] = [
+                    (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+                ]
+
+        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
+        if self.use_past:
+            mask_dtype = ordered_inputs["attention_mask"].dtype
+            ordered_inputs["attention_mask"] = torch.cat(
+                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
+
+
+__all__ = ["GPT2Config", "GPT2OnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e419217c5a3642ee27f6f3df87e1c27c0d5ac79
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_flax_gpt2.py
@@ -0,0 +1,782 @@
+# coding=utf-8
+# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+)
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_gpt2 import GPT2Config
+
+
+# Module-level logger obtained from the library's logging utilities.
+logger = logging.get_logger(__name__)
+
+# Checkpoint and config names handed to `append_call_sample_docstring` further down in this module.
+_CHECKPOINT_FOR_DOC = "openai-community/gpt2"
+_CONFIG_FOR_DOC = "GPT2Config"
+
+
+GPT2_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+GPT2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class FlaxConv1D(nn.Module):
+    features: int
+    use_bias: bool = True
+    dtype: Any = jnp.float32
+    precision: Any = None
+
+    @nn.compact
+    def __call__(self, inputs):
+        inputs = jnp.asarray(inputs, self.dtype)
+        kernel = self.param("kernel", jax.nn.initializers.normal(stddev=0.02), (self.features, inputs.shape[-1]))
+        kernel = jnp.asarray(kernel.transpose(), self.dtype)
+        y = lax.dot_general(inputs, kernel, (((inputs.ndim - 1,), (0,)), ((), ())), precision=self.precision)
+        if self.use_bias:
+            bias = self.param("bias", jax.nn.initializers.zeros, (self.features,))
+            bias = jnp.asarray(bias, self.dtype)
+            y = y + bias
+        return y
+
+
+class FlaxGPT2Attention(nn.Module):
+    """
+    Multi-head attention layer used by the Flax GPT-2 blocks.
+
+    `setup` creates the projections according to `is_cross_attention`: either a fused `c_attn` producing
+    `3 * embed_dim` features, or a `c_attn` producing `2 * embed_dim` features plus a separate `q_attn`.
+    `__call__` takes the cross-attention path whenever `key_value_states` is provided. When `causal` is True a
+    causal mask is applied and a key/value cache stored in the "cache" collection can be used.
+    """
+
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+    causal: bool = True
+    is_cross_attention: bool = False
+
+    def setup(self):
+        """Create the projection layers, the residual dropout and (if causal) the static causal mask."""
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+
+        if self.is_cross_attention:
+            # key/value come from `c_attn` (2 * embed_dim), the query from its own projection
+            self.c_attn = FlaxConv1D(2 * self.embed_dim, dtype=self.dtype)
+            self.q_attn = FlaxConv1D(self.embed_dim, dtype=self.dtype)
+        else:
+            # fused query/key/value projection (3 * embed_dim)
+            self.c_attn = FlaxConv1D(3 * self.embed_dim, dtype=self.dtype)
+        self.c_proj = FlaxConv1D(self.embed_dim, dtype=self.dtype)
+
+        self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
+
+        if self.causal:
+            # boolean causal mask built once for `config.max_position_embeddings` positions; sliced in `__call__`
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+
+    def _split_heads(self, hidden_states):
+        """Reshape the trailing feature axis into `(num_heads, head_dim)`, keeping the first two axes."""
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):
+        """Inverse of `_split_heads`: collapse the trailing axes back into a single `embed_dim` axis."""
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+
+        Returns the `(key, value, attention_mask)` triple. If the cache variables did not exist before this call they
+        are only created (zero-filled, with the shapes of `key`/`value`) and the inputs are returned unchanged.
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            # advance the write position by the number of query positions processed in this call
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those
+            # key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        key_value_states: Optional[jnp.ndarray] = None,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        """
+        Apply attention to `hidden_states`.
+
+        Args:
+            hidden_states: input from which the query (and, for self-attention, key and value) is projected.
+            key_value_states: if given, key and value are projected from it (cross-attention path).
+            attention_mask: optional mask; combined with the causal mask when `self.causal` is True.
+            deterministic: disables attention and residual dropout when True.
+            init_cache: when True (and `self.causal`), the cache variables are created during this call.
+            output_attentions: when True, the attention weights are returned as a second tuple element.
+
+        Returns:
+            `(attn_output,)` or `(attn_output, attn_weights)`.
+        """
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+
+        if not is_cross_attention:
+            qkv_out = self.c_attn(hidden_states)
+            query, key, value = jnp.split(qkv_out, 3, axis=2)
+        else:
+            q_out = self.q_attn(hidden_states)
+            (query,) = jnp.split(q_out, 1, axis=2)
+            kv_out = self.c_attn(key_value_states)
+            key, value = jnp.split(kv_out, 2, axis=2)
+
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+
+        query_length, key_length = query.shape[1], key.shape[1]
+
+        if self.causal:
+            if self.has_variable("cache", "cached_key"):
+                # cached decoding: take the rows of the causal mask starting at the current cache index and
+                # as many columns as the cache holds
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+
+        # a dropout rng is only drawn when attention dropout is actually active
+        dropout_rng = None
+        if not deterministic and self.config.attn_pdrop > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        # transform boolean mask into float mask
+        # (0.0 where attention is allowed, the most negative finite value of `self.dtype` elsewhere)
+        if attention_mask is not None:
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+
+        # usual dot product attention
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attn_pdrop,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        # weight the values, merge the heads back together and apply the output projection
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+class FlaxGPT2MLP(nn.Module):
+    config: GPT2Config
+    intermediate_size: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        self.c_fc = FlaxConv1D(self.intermediate_size, dtype=self.dtype)
+        self.c_proj = FlaxConv1D(embed_dim, dtype=self.dtype)
+        self.act = ACT2FN[self.config.activation_function]
+        self.dropout = nn.Dropout(rate=self.config.resid_pdrop)
+
+    def __call__(self, hidden_states, deterministic: bool = True):
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+class FlaxGPT2Block(nn.Module):
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        hidden_size = self.config.hidden_size
+        inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+        self.attn = FlaxGPT2Attention(self.config, dtype=self.dtype)
+        self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+        if self.config.add_cross_attention:
+            self.crossattention = FlaxGPT2Attention(
+                config=self.config, dtype=self.dtype, causal=False, is_cross_attention=True
+            )
+            self.ln_cross_attn = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+        self.mlp = FlaxGPT2MLP(self.config, inner_dim, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        # residual connection
+        attn_output = attn_outputs[0]  # output_attn: a, (attentions)
+        outputs = attn_outputs[1:]
+        # residual connection
+        hidden_states = attn_output + residual
+
+        # Cross-Attention Block
+        if encoder_hidden_states is not None:
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            residual = hidden_states
+            hidden_states = self.ln_cross_attn(hidden_states)
+            cross_attn_outputs = self.crossattention(
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+            attn_output = cross_attn_outputs[0]
+            # residual connection
+            hidden_states = residual + attn_output
+            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+
+        outputs = (hidden_states,) + outputs
+
+        return outputs
+
+
+class FlaxGPT2PreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = GPT2Config
+    base_model_prefix = "transformer"
+    # Subclasses set this to the Flax module class that `__init__` instantiates.
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        """Instantiate `module_class` with `config`, `dtype` and `kwargs`, then defer to the parent constructor."""
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        """
+        Initialize parameters by running `module.init` on dummy inputs of shape `input_shape`.
+
+        If `params` is given, only the entries listed in `self._missing_keys` are taken from the freshly
+        initialized parameters; `self._missing_keys` is then reset and the merged, frozen tree is returned.
+        Otherwise the freshly initialized parameters are returned as-is.
+        """
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        # separate rng streams for parameter initialization and dropout
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        if self.config.add_cross_attention:
+            # dummy encoder inputs so that the cross-attention path is traced during init
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                position_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)
+
+        random_params = module_init_outputs["params"]
+
+        if params is not None:
+            # work on flattened dicts so that missing entries can be copied key by key
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Create the cache variables by running `module.init` with `init_cache=True` on dummy inputs.
+
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+
+        Returns:
+            The unfrozen `"cache"` collection of the initialized variables.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        params: Optional[dict] = None,
+        past_key_values: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        # fall back to the config values for every output flag that was not set explicitly
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # default encoder mask: attend to every encoder position
+        if encoder_hidden_states is not None and encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        batch_size, sequence_length = input_ids.shape
+
+        if position_ids is None:
+            # default positions start at 0, which is not accepted together with a cache
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        # explicitly passed `params` take precedence over the parameters stored on the model
+        inputs = {"params": params or self.params}
+
+        # If `past_key_values` are passed, the cache is already initialized: it is supplied as the "cache"
+        # collection and marked as mutable so that the FlaxGPT2Attention module can update it.
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        # The positional arguments below follow the module's `__call__` signature.
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            encoder_hidden_states,
+            encoder_attention_mask,
+            not train,  # deterministic
+            False,  # init_cache
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            # with a mutable collection, `apply` returns the outputs together with the mutated variables
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            # tuple output: the cache is inserted right after the first element
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        return outputs
+
+
+class FlaxGPT2BlockCollection(nn.Module):
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [
+            FlaxGPT2Block(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = block(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # this contains possible `None` values - `FlaxGPT2Module` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
+
+        return outputs
+
+
+class FlaxGPT2Module(nn.Module):
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.embed_dim = self.config.hidden_size
+
+        self.wte = nn.Embed(
+            self.config.vocab_size,
+            self.embed_dim,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.wpe = nn.Embed(
+            self.config.max_position_embeddings,
+            self.embed_dim,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
+        self.h = FlaxGPT2BlockCollection(self.config, dtype=self.dtype)
+        self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.wte(input_ids.astype("i4"))
+        position_embeds = self.wpe(position_ids.astype("i4"))
+
+        hidden_states = input_embeds + position_embeds
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+
+        outputs = self.h(
+            hidden_states,
+            attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[2],
+            cross_attentions=outputs[3],
+        )
+
+
+# Concrete model class: it only binds `FlaxGPT2Module` as the module that `FlaxGPT2PreTrainedModel.__init__`
+# instantiates. The class docstring is supplied through the `add_start_docstrings` decorator.
+@add_start_docstrings(
+    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
+    GPT2_START_DOCSTRING,
+)
+class FlaxGPT2Model(FlaxGPT2PreTrainedModel):
+    module_class = FlaxGPT2Module
+
+
+# NOTE(review): presumably attaches a usage example (built from the checkpoint, output class and config names)
+# to `FlaxGPT2Model.__call__`'s docstring — confirm against `modeling_flax_utils.append_call_sample_docstring`.
+append_call_sample_docstring(
+    FlaxGPT2Model,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxGPT2LMHeadModule(nn.Module):
+    config: GPT2Config
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.transformer = FlaxGPT2Module(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask,
+            position_ids,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+
+        if self.config.tie_word_embeddings:
+            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=lm_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class FlaxGPT2LMHeadModel(FlaxGPT2PreTrainedModel):
+    module_class = FlaxGPT2LMHeadModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since GPT2 uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(
+                extended_attention_mask, attention_mask.astype("i4"), (0, 0)
+            )
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+
+
+# NOTE(review): presumably attaches a usage example (built from the checkpoint, output class and config names)
+# to `FlaxGPT2LMHeadModel.__call__`'s docstring — confirm against `modeling_flax_utils.append_call_sample_docstring`.
+append_call_sample_docstring(
+    FlaxGPT2LMHeadModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutputWithCrossAttentions,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Names exported by `from <module> import *`.
+__all__ = ["FlaxGPT2LMHeadModel", "FlaxGPT2Model", "FlaxGPT2PreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae0786179464115b880ab5d5b4c771292ad5b2db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_gpt2.py
@@ -0,0 +1,1638 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OpenAI GPT-2 model."""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN, get_activation
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutputWithPast,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    auto_docstring,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.model_parallel_utils import assert_device_map, get_device_map
+from .configuration_gpt2 import GPT2Config
+
+
+logger = logging.get_logger(__name__)
+
+
+def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
+    """Load tf checkpoints in a pytorch model"""
+    try:
+        import re
+
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(gpt2_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array.squeeze())
+
+    for name, array in zip(names, arrays):
+        name = name[6:]  # skip "model/"
+        name = name.split("/")
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+                scope_names = re.split(r"(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "w" or scope_names[0] == "g":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "b":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
+                pointer = getattr(pointer, scope_names[0])
+                pointer = getattr(pointer, "weight")
+            else:
+                pointer = getattr(pointer, scope_names[0])
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except ValueError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2))
+
+    if module.scale_attn_weights:
+        attn_weights = attn_weights / torch.full(
+            [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+        )
+
+    # Layer-wise attention scaling
+    if module.scale_attn_by_inverse_layer_idx:
+        attn_weights = attn_weights / float(module.layer_idx + 1)
+
+    if not module.is_cross_attention:
+        # if only "normal" attention layer implements causal mask
+        query_length, key_length = query.size(-2), key.size(-2)
+        causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length]
+        mask_value = torch.finfo(attn_weights.dtype).min
+        # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+        # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+        mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+        attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
+
+    if attention_mask is not None:
+        # Apply the attention mask
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+    # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
+    attn_weights = attn_weights.type(value.dtype)
+    attn_weights = module.attn_dropout(attn_weights)
+
+    # Mask heads if we want to
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2)
+
+    return attn_output, attn_weights
+
+
+class GPT2Attention(nn.Module):
+    def __init__(self, config, is_cross_attention=False, layer_idx=None):
+        super().__init__()
+        self.config = config
+        max_positions = config.max_position_embeddings
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                1, 1, max_positions, max_positions
+            ),
+            persistent=False,
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.split_size = self.embed_dim
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.scale_attn_weights = config.scale_attn_weights
+        self.is_cross_attention = is_cross_attention
+
+        # Layer-wise attention scaling, reordering, and upcasting
+        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+        self.layer_idx = layer_idx
+        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
+
+        if self.is_cross_attention:
+            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
+            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
+        else:
+            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
+        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
+
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.is_causal = True
+
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
+
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+
+        # Update hyper params
+        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
+        self.num_heads = self.num_heads - len(heads)
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
+        bsz, num_heads, q_seq_len, dk = query.size()
+        _, _, k_seq_len, _ = key.size()
+
+        # Preallocate attn_weights for `baddbmm`
+        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
+
+        # Compute Scale Factor
+        scale_factor = 1.0
+        if self.scale_attn_weights:
+            scale_factor /= float(value.size(-1)) ** 0.5
+
+        if self.scale_attn_by_inverse_layer_idx:
+            scale_factor /= float(self.layer_idx + 1)
+
+        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
+        with torch.autocast(query.device.type, enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+
+        if not self.is_cross_attention:
+            # if only "normal" attention layer implements causal mask
+            query_length, key_length = query.size(-2), key.size(-2)
+            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+            mask_value = torch.finfo(attn_weights.dtype).min
+            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
+        if attn_weights.dtype != torch.float32:
+            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+        attn_output = attn_output.transpose(1, 2)
+
+        return attn_output, attn_weights
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+        is_cross_attention = encoder_hidden_states is not None
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        if is_cross_attention:
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
+                )
+            query_states = self.q_attn(hidden_states)
+            attention_mask = encoder_attention_mask
+
+            # Try to get key/value states from cache if possible
+            if past_key_values is not None and is_updated:
+                key_states = curr_past_key_value.layers[self.layer_idx].keys
+                value_states = curr_past_key_value.layers[self.layer_idx].values
+            else:
+                key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
+                shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
+                key_states = key_states.view(shape_kv).transpose(1, 2)
+                value_states = value_states.view(shape_kv).transpose(1, 2)
+        else:
+            query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+            shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
+            key_states = key_states.view(shape_kv).transpose(1, 2)
+            value_states = value_states.view(shape_kv).transpose(1, 2)
+
+        shape_q = (*query_states.shape[:-1], -1, self.head_dim)
+        query_states = query_states.view(shape_q).transpose(1, 2)
+
+        if (past_key_values is not None and not is_cross_attention) or (
+            past_key_values is not None and is_cross_attention and not is_updated
+        ):
+            # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+            cache_position = cache_position if not is_cross_attention else None
+            key_states, value_states = curr_past_key_value.update(
+                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+            )
+            # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+            if is_cross_attention:
+                past_key_values.is_updated[self.layer_idx] = True
+
+        is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention
+
+        using_eager = self.config._attn_implementation == "eager"
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        if using_eager and self.reorder_and_upcast_attn:
+            attn_output, attn_weights = self._upcast_and_reordered_attn(
+                query_states, key_states, value_states, attention_mask, head_mask
+            )
+        else:
+            attn_output, attn_weights = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask,
+                head_mask=head_mask,
+                dropout=self.attn_dropout.p if self.training else 0.0,
+                is_causal=is_causal,
+                **kwargs,
+            )
+
+        attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous()
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights
+
+
+class GPT2MLP(nn.Module):
+    def __init__(self, intermediate_size, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = Conv1D(intermediate_size, embed_dim)
+        self.c_proj = Conv1D(embed_dim, intermediate_size)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class GPT2Block(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPT2Attention(config=config, layer_idx=layer_idx)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        if config.add_cross_attention:
+            self.crossattention = GPT2Attention(config=config, is_cross_attention=True, layer_idx=layer_idx)
+            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = GPT2MLP(inner_dim, config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output, self_attn_weights = self.attn(
+            hidden_states,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        if encoder_hidden_states is not None:
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            residual = hidden_states
+            hidden_states = self.ln_cross_attn(hidden_states)
+            cross_attn_output, cross_attn_weights = self.crossattention(
+                hidden_states,
+                past_key_values=past_key_values,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+            # residual connection
+            hidden_states = residual + cross_attn_output
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+            if encoder_hidden_states is not None:
+                outputs += (cross_attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.xlm.modeling_xlm.XLMSequenceSummary with XLM->GPT2
+class GPT2SequenceSummary(nn.Module):
+    r"""
+    Compute a single vector summary of a sequence hidden states.
+
+    Args:
+        config ([`GPT2Config`]):
+            The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
+            config class of your model for the default values it uses):
+
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
+
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
+
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes
+              (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output,
+              another string or `None` will add no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
+            - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.
+    """
+
+    def __init__(self, config: GPT2Config):
+        super().__init__()
+
+        self.summary_type = getattr(config, "summary_type", "last")
+        if self.summary_type == "attn":
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        self.summary = nn.Identity()
+        if hasattr(config, "summary_use_proj") and config.summary_use_proj:
+            if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation: Callable = get_activation(activation_string) if activation_string else nn.Identity()
+
+        self.first_dropout = nn.Identity()
+        if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+    def forward(
+        self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None
+    ) -> torch.FloatTensor:
+        """
+        Compute a single vector summary of a sequence hidden states.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
+                The hidden states of the last layer.
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token.
+
+        Returns:
+            `torch.FloatTensor`: The summary of the sequence hidden states.
+        """
+        if self.summary_type == "last":
+            output = hidden_states[:, -1]
+        elif self.summary_type == "first":
+            output = hidden_states[:, 0]
+        elif self.summary_type == "mean":
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == "cls_index":
+            if cls_index is None:
+                cls_index = torch.full_like(
+                    hidden_states[..., :1, :],
+                    hidden_states.shape[-2] - 1,
+                    dtype=torch.long,
+                )
+            else:
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == "attn":
+            raise NotImplementedError
+
+        output = self.first_dropout(output)
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
+@auto_docstring
+class GPT2PreTrainedModel(PreTrainedModel):
+    config: GPT2Config
+    load_tf_weights = load_tf_weights_in_gpt2
+    base_model_prefix = "transformer"
+    is_parallelizable = True
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["GPT2Block"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_attention_backend = True
+
+    _can_compile_fullgraph = True
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+        #
+        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+        for name, p in module.named_parameters():
+            if name == "c_proj.weight":
+                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of models predicting if two sentences are consecutive or not.
+    """
+)
+class GPT2DoubleHeadsModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss.
+    mc_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mc_labels` is provided):
+        Multiple choice classification loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    mc_logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`):
+        Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    mc_loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    mc_logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+PARALLELIZE_DOCSTRING = r"""
+    This is an experimental feature and is a subject to change at a moment's notice.
+
+    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
+    it will evenly distribute blocks across all devices.
+
+    Args:
+        device_map (`dict[int, list]`, *optional*):
+            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
+            automatically mapped to the first device (for esoteric reasons). That means that the first device should
+            have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the
+            following number of attention modules:
+
+                - openai-community/gpt2: 12
+                - openai-community/gpt2-medium: 24
+                - openai-community/gpt2-large: 36
+                - openai-community/gpt2-xl: 48
+
+    Example:
+
+    ```python
+    # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-xl")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
+        1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
+        2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
+        3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
+    }
+    model.parallelize(device_map)
+    ```
+"""
+DEPARALLELIZE_DOCSTRING = r"""
+    Moves the model to cpu from a model parallel state.
+
+    Example:
+
+    ```python
+    # On a 4 GPU machine with openai-community/gpt2-large:
+    model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2-large")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6, 7],
+        1: [8, 9, 10, 11, 12, 13, 14, 15],
+        2: [16, 17, 18, 19, 20, 21, 22, 23],
+        3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Puts the model back on cpu and frees cached memory by calling torch.cuda.empty_cache()
+    ```
+"""
+
+
+@auto_docstring
+class GPT2Model(GPT2PreTrainedModel):
+    _supports_param_buffer_assignment = False
+
+    def __init__(self, config):
+        # Build the GPT-2 backbone from `config`: two embedding tables, embedding dropout, a stack of
+        # `config.num_hidden_layers` GPT2Block modules and a final LayerNorm.
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+
+        # `forward` indexes `wte` with token ids (and token type ids) and `wpe` with position ids.
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+        self.drop = nn.Dropout(config.embd_pdrop)
+        # Each block is told its own position in the stack through `layer_idx`.
+        self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Model parallel
+        # `parallelize` / `deparallelize` toggle these two fields; `forward` reads `model_parallel`.
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+        # Cached copy of the attention backend name; `forward` compares it to "sdpa" / "flash_attention_2".
+        self._attn_implementation = config._attn_implementation
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        # Check validity of device_map
+        warnings.warn(
+            "`GPT2Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
+            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
+            " ...}",
+            FutureWarning,
+        )
+        self.device_map = (
+            get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
+        )
+        assert_device_map(self.device_map, len(self.h))
+        self.model_parallel = True
+        self.first_device = "cpu" if "cpu" in self.device_map else "cuda:" + str(min(self.device_map.keys()))
+        self.last_device = "cuda:" + str(max(self.device_map.keys()))
+        self.wte = self.wte.to(self.first_device)
+        self.wpe = self.wpe.to(self.first_device)
+        # Load onto devices
+        for k, v in self.device_map.items():
+            for block in v:
+                cuda_device = "cuda:" + str(k)
+                self.h[block] = self.h[block].to(cuda_device)
+        # ln_f to last
+        self.ln_f = self.ln_f.to(self.last_device)
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        self.model_parallel = False
+        self.device_map = None
+        self.first_device = "cpu"
+        self.last_device = "cpu"
+        self.wte = self.wte.to("cpu")
+        self.wpe = self.wpe.to("cpu")
+        for index in range(len(self.h)):
+            self.h[index] = self.h[index].to("cpu")
+        self.ln_f = self.ln_f.to("cpu")
+        torch.cuda.empty_cache()
+
+    def get_input_embeddings(self):
+        # Returns the token embedding module (`self.wte`).
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        # Replaces the token embedding module (`self.wte`) with `new_embeddings`; nothing else is updated here.
+        self.wte = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.h[layer].attn.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        # Per-call flags left as None fall back to the values stored on the model config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Exactly one of `input_ids` / `inputs_embeds` must be supplied. `input_ids` is flattened to
+        # (-1, last_dim); the original shape is kept in `input_shape` to restore the output at the end.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        # Device of whichever input was given; only used below for the default encoder attention mask.
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+        # Caching is switched off while training with gradient checkpointing.
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder
+        if use_cache:
+            # Create a fresh cache when none was passed; convert a legacy tuple cache into a DynamicCache.
+            if past_key_values is None:
+                past_key_values = DynamicCache(config=self.config)
+            elif isinstance(past_key_values, tuple):
+                logger.warning_once(
+                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. "
+                    "You should pass an instance of `Cache` instead, e.g. "
+                    "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
+                )
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+            # With cross-attention enabled, the cache is wrapped together with a second, new DynamicCache.
+            if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache):
+                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config))
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+
+        # Default cache positions continue right after the tokens already held in the cache (0 without a cache).
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        # Default position ids are the cache positions with a leading dimension of size 1 added.
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
+
+        # Attention mask.
+        # ._update_causal_mask() and ._prepare_4d_causal_attention_mask_with_cache_position() copied from LlamaModel
+        # Masks with fewer than 4 dimensions are reshaped to (batch_size, -1) before building the causal mask.
+        if attention_mask is not None and attention_mask.ndim < 4:
+            attention_mask = attention_mask.view(batch_size, -1)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        # The SDPA mask preparation is used only when attentions are explicitly not requested and no head mask is set.
+        _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None
+        if self.config.add_cross_attention and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            # A missing encoder mask defaults to all ones over the encoder (batch, sequence) positions.
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            if _use_sdpa:
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    mask=encoder_attention_mask, dtype=inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+            # For "flash_attention_2" the encoder mask is passed through unchanged.
+            elif self._attn_implementation != "flash_attention_2":
+                encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            # Any encoder mask passed in is discarded when cross-attention is not in use.
+            encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        # Token type ids are embedded with the same table as the input tokens (`wte`) and added on top.
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+
+        # Shape used to reshape the final hidden states: input dims after the first, plus the hidden size.
+        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+
+        # Optional outputs are collected as tuples; None means "not requested".
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            # The hidden states are recorded before each block runs, i.e. the block's input.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                # No cache is handed to the block while training with gradient checkpointing.
+                past_key_values if not (self.gradient_checkpointing and self.training) else None,
+                cache_position,
+                causal_mask,
+                head_mask[i],
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                **kwargs,
+            )
+
+            # outputs[0] is the block's hidden states; outputs[1] / outputs[2] are read below as the
+            # self-attention and cross-attention entries when attentions are requested.
+            hidden_states = outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (outputs[2],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            # NOTE(review): the hand-off targets "cuda:" + str(k + 1), so it presumably expects integer,
+            # consecutive device_map keys - confirm before using other maps.
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # The cache is only returned when caching is enabled.
+        past_key_values = past_key_values if use_cache else None
+        # Tuple output keeps the order below and drops every entry that is None.
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class GPT2LMHeadModel(GPT2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        # Wraps a GPT2Model backbone with a bias-free linear head projecting `config.n_embd` features
+        # to `config.vocab_size` logits.
+        super().__init__(config)
+        self.transformer = GPT2Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        # Model parallel
+        # Toggled by `parallelize` / `deparallelize`; `forward` reads `model_parallel`.
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPT2LMHeadModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
+            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
+            " 0, 'transformer.h.1': 1, ...}",
+            FutureWarning,
+        )
+        self.device_map = (
+            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
+            if device_map is None
+            else device_map
+        )
+        assert_device_map(self.device_map, len(self.transformer.h))
+        self.transformer.parallelize(self.device_map)
+        self.lm_head = self.lm_head.to(self.transformer.first_device)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        self.transformer.deparallelize()
+        self.transformer = self.transformer.to("cpu")
+        self.lm_head = self.lm_head.to("cpu")
+        self.model_parallel = False
+        torch.cuda.empty_cache()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
+            `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Run the backbone. `**kwargs` is not forwarded here; it only reaches the loss function below.
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Index 0 is the last hidden state for both the tuple and the ModelOutput form.
+        hidden_states = transformer_outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.transformer.first_device)
+            hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+        # An int keeps only the last `logits_to_keep` positions; the default 0 gives slice(0, None), i.e. all
+        # positions. Any non-int value is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            # Loss computation is delegated to `self.loss_function`, which also receives `**kwargs`.
+            loss = self.loss_function(
+                logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        # Tuple output: (loss,) is prepended only when a loss was computed.
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+            cross_attentions=transformer_outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
+    RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
+    input embeddings; the classification head takes as input the hidden states and a specified classification token
+    index in the input sequence.
+    """
+)
+class GPT2DoubleHeadsModel(GPT2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        # Wraps a GPT2Model backbone with two heads: a bias-free linear LM head and a GPT2SequenceSummary
+        # multiple-choice head.
+        super().__init__(config)
+        # NOTE: this overwrites `num_labels` on the config object that was passed in, before the submodules are built.
+        config.num_labels = 1
+        self.transformer = GPT2Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.multiple_choice_head = GPT2SequenceSummary(config)
+
+        # Model parallel
+        # Toggled by `parallelize` / `deparallelize`; `forward` reads `model_parallel`.
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings(PARALLELIZE_DOCSTRING)
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPT2DoubleHeadsModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should"
+            " load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your"
+            " own `device_map` but it needs to be a dictionary module_name to device, so for instance"
+            " {'transformer.h.0': 0, 'transformer.h.1': 1, ...}",
+            FutureWarning,
+        )
+        self.device_map = (
+            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
+            if device_map is None
+            else device_map
+        )
+        assert_device_map(self.device_map, len(self.transformer.h))
+        self.transformer.parallelize(self.device_map)
+        self.lm_head = self.lm_head.to(self.transformer.first_device)
+        self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device)
+        self.model_parallel = True
+
+    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        self.transformer.deparallelize()
+        self.transformer = self.transformer.to("cpu")
+        self.lm_head = self.lm_head.to("cpu")
+        self.multiple_choice_head = self.multiple_choice_head.to("cpu")
+        self.model_parallel = False
+        torch.cuda.empty_cache()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        mc_token_ids: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        mc_labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, GPT2DoubleHeadsModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        mc_token_ids (`torch.LongTensor` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
+            Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
+            1]`.
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to
+            `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size - 1]`
+        mc_labels (`torch.LongTensor` of shape `(batch_size)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices - 1]` where *num_choices* is the size of the second dimension of the input tensors. (see
+            *input_ids* above)
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, GPT2DoubleHeadsModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = GPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")
+
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
+        >>> # Update the model embeddings with the new vocabulary size
+        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))
+
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+
+        >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_logits = outputs.logits
+        >>> mc_logits = outputs.mc_logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Run the backbone. `**kwargs` is accepted by this method but not forwarded.
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Index 0 is the last hidden state for both the tuple and the ModelOutput form.
+        hidden_states = transformer_outputs[0]
+
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.transformer.first_device)
+            hidden_states = hidden_states.to(self.lm_head.weight.device)
+
+        # Both heads read the same hidden states; the multiple-choice head also receives `mc_token_ids`,
+        # and its trailing dimension is squeezed away.
+        lm_logits = self.lm_head(hidden_states)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
+
+        mc_loss = None
+        if mc_labels is not None:
+            # Cross-entropy over the last dimension of `mc_logits`, with the labels flattened.
+            loss_fct = CrossEntropyLoss()
+            mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
+        lm_loss = None
+        if labels is not None:
+            labels = labels.to(lm_logits.device)
+            # Shift by one so the logits at position t are scored against the label at position t + 1.
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        # Tuple output: losses are prepended only when computed, LM loss first, then multiple-choice loss.
+        if not return_dict:
+            output = (lm_logits, mc_logits) + transformer_outputs[1:]
+            if mc_loss is not None:
+                output = (mc_loss,) + output
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return GPT2DoubleHeadsModelOutput(
+            loss=lm_loss,
+            mc_loss=mc_loss,
+            logits=lm_logits,
+            mc_logits=mc_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The GPT2 Model transformer with a sequence classification head on top (linear layer).
+
+    [`GPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class GPT2ForSequenceClassification(GPT2PreTrainedModel):
+    def __init__(self, config):
+        # Wraps a GPT2Model backbone with a bias-free linear scoring head projecting `config.n_embd` features
+        # to `config.num_labels` outputs.
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPT2Model(config)
+        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPT2ForTokenClassification(GPT2PreTrainedModel):  # GPT2Model backbone + dropout + per-token linear classifier
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = GPT2Model(config)
+        if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:  # 1st choice
+            classifier_dropout = config.classifier_dropout
+        elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None:  # 2nd choice
+            classifier_dropout = config.hidden_dropout
+        else:
+            classifier_dropout = 0.1  # fallback when the config sets neither attribute
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)  # hidden state -> num_labels logits
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. The loss is always a Cross-Entropy loss; `logits` and `labels` are flattened
+            over all token positions before it is applied.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict  # config default if unset
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        hidden_states = self.dropout(hidden_states)
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)  # loss inputs must share a device
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))  # flatten to (N, num_labels) vs (N,)
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[2:]  # skips outputs[0] and [1]; presumably [1] is the cache (confirm)
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when computed
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+@auto_docstring
+class GPT2ForQuestionAnswering(GPT2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = GPT2Model(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.transformer(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1).to(start_logits.device)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1).to(end_logits.device)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [  # public names of this module (what `from <module> import *` exposes)
+    "GPT2DoubleHeadsModel",
+    "GPT2ForQuestionAnswering",
+    "GPT2ForSequenceClassification",
+    "GPT2ForTokenClassification",
+    "GPT2LMHeadModel",
+    "GPT2Model",
+    "GPT2PreTrainedModel",
+    "load_tf_weights_in_gpt2",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..42e23fc290151f09d47a30efca1cb7f4e4a3d669
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -0,0 +1,1238 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 OpenAI GPT-2 model."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFCausalLMOutputWithCrossAttentions,
+    TFSequenceClassifierOutputWithPast,
+)
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFConv1D,
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    TFSequenceSummary,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_gpt2 import GPT2Config
+
+
+logger = logging.get_logger(__name__)  # module-level logger named after this module
+
+_CHECKPOINT_FOR_DOC = "openai-community/gpt2"  # presumably consumed by the docstring decorators imported above; confirm
+_CONFIG_FOR_DOC = "GPT2Config"  # same caveat: its use sites are outside this view
+
+
+class TFAttention(keras.layers.Layer):  # multi-head attention: causal self-attention by default, optional cross-attention
+    def __init__(self, nx, config, scale=False, is_cross_attention=False, **kwargs):
+        super().__init__(**kwargs)
+
+        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+        # [switch nx => n_state from Block to Attention to keep identical to TF implementation]
+        assert n_state % config.n_head == 0  # width must divide evenly across heads (see split_heads)
+        self.n_head = config.n_head
+        self.split_size = n_state
+        self.scale = scale  # when truthy, _attn divides the scores by sqrt(per-head feature size)
+        self.output_attentions = config.output_attentions
+
+        self.is_cross_attention = is_cross_attention
+
+        if self.is_cross_attention:
+            self.c_attn = TFConv1D(n_state * 2, nx, initializer_range=config.initializer_range, name="c_attn")  # key + value
+            self.q_attn = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="q_attn")  # query
+        else:
+            self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name="c_attn")  # q, k, v
+
+        self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_proj")  # output projection
+        self.attn_dropout = keras.layers.Dropout(config.attn_pdrop)  # applied to the attention probabilities
+        self.resid_dropout = keras.layers.Dropout(config.resid_pdrop)  # applied to the projected output
+        self.pruned_heads = set()
+        self.embed_dim = n_state
+
+    def prune_heads(self, heads):
+        pass  # no-op: head pruning is not implemented in this layer
+
+    @staticmethod
+    def causal_attention_mask(nd, ns, dtype):
+        """
+        Build an [nd, ns] mask of `dtype` holding 1 where `j <= i + (ns - nd)` and 0 elsewhere, i.e. 1's in the lower
+        triangle, counting from the lower right corner. Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but
+        doesn't produce garbage on TPUs.
+        """
+        i = tf.range(nd)[:, None]  # column vector of destination indices
+        j = tf.range(ns)  # row vector of source indices
+        m = i >= j - ns + nd  # broadcasts to a boolean [nd, ns] grid
+        return tf.cast(m, dtype)
+
+    def _attn(self, q, k, v, attention_mask, head_mask, output_attentions, training=False):
+        # q, k, v have shape [batch, heads, sequence, features]; returns [context] or [context, probabilities]
+        w = tf.matmul(q, k, transpose_b=True)  # raw attention scores q @ k^T
+        if self.scale:
+            dk = tf.cast(shape_list(k)[-1], dtype=w.dtype)  # per-head feature size, used to scale attention scores
+            w = w / tf.math.sqrt(dk)
+
+        if not self.is_cross_attention:
+            # only the "normal" (self-)attention layer applies the causal mask
+
+            # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
+            _, _, nd, ns = shape_list(w)
+            b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
+            b = tf.reshape(b, [1, 1, nd, ns])  # add broadcast axes for batch and heads
+            w = w * b - 1e4 * (1 - b)  # entries with b == 0 are replaced by -1e4
+
+        if attention_mask is not None:
+            # Apply the attention mask additively to the scores (before the softmax)
+            attention_mask = tf.cast(attention_mask, dtype=w.dtype)
+            w = w + attention_mask
+
+        w = stable_softmax(w, axis=-1)  # normalize over the source axis
+        w = self.attn_dropout(w, training=training)
+
+        # Mask heads if we want to (multiplicative mask on the probabilities)
+        if head_mask is not None:
+            w = w * head_mask
+
+        outputs = [tf.matmul(w, v)]  # weighted sum of the values
+        if output_attentions:
+            outputs.append(w)
+        return outputs
+
+    def merge_heads(self, x):  # inverse of split_heads
+        x = tf.transpose(x, [0, 2, 1, 3])  # swap axes 1 and 2
+        x_shape = shape_list(x)
+        new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]  # fold the last two axes into one
+        return tf.reshape(x, new_x_shape)
+
+    def split_heads(self, x):  # split the last axis into (n_head, features // n_head)
+        x_shape = shape_list(x)
+        new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
+        x = tf.reshape(x, new_x_shape)
+        return tf.transpose(x, (0, 2, 1, 3))  # (batch, head, seq_length, head_features)
+
+    def call(
+        self,
+        x,
+        layer_past,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states,
+        encoder_attention_mask,
+        use_cache,
+        output_attentions,
+        training=False,
+    ):
+        if encoder_hidden_states is not None:  # cross-attention path
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`."
+                )
+
+            query = self.q_attn(x)  # queries come from this layer's input
+            kv_out = self.c_attn(encoder_hidden_states)  # keys and values come from the encoder states
+            key, value = tf.split(kv_out, 2, axis=2)
+            attention_mask = encoder_attention_mask  # the encoder mask replaces the mask passed in
+        else:  # self-attention path
+            x = self.c_attn(x)
+            query, key, value = tf.split(x, 3, axis=2)  # fused projection split into q, k, v
+
+        query = self.split_heads(query)
+        key = self.split_heads(key)
+        value = self.split_heads(value)
+        if layer_past is not None:
+            past_key, past_value = tf.unstack(layer_past, axis=0, num=2)  # layer_past stacks key and value on axis 0
+            key = tf.concat([past_key, key], axis=-2)  # prepend cached keys along the sequence axis
+            value = tf.concat([past_value, value], axis=-2)  # same for the cached values
+
+        # to cope with keras serialization
+        if use_cache:
+            present = tf.stack([key, value], axis=0)  # same layout that layer_past is unstacked from
+        else:
+            present = (None,)
+
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask, output_attentions, training=training)
+        a = attn_outputs[0]
+
+        a = self.merge_heads(a)
+        a = self.c_proj(a)
+        a = self.resid_dropout(a, training=training)
+
+        outputs = [a, present] + attn_outputs[1:]
+        return outputs  # a, present, (attentions)
+
+    def build(self, input_shape=None):  # builds the sub-layers under their own name scopes; input_shape is not read
+        if self.built:
+            return
+        self.built = True
+        if self.is_cross_attention:
+            c_attn_shape = 2 * self.embed_dim
+        else:
+            c_attn_shape = 3 * self.embed_dim
+        if getattr(self, "c_proj", None) is not None:
+            with tf.name_scope(self.c_proj.name):
+                self.c_proj.build([None, None, self.embed_dim])
+        if getattr(self, "c_attn", None) is not None:
+            with tf.name_scope(self.c_attn.name):
+                self.c_attn.build([None, None, c_attn_shape])  # NOTE(review): this is c_attn's output width; confirm build ignores it
+        if getattr(self, "q_attn", None) is not None:
+            with tf.name_scope(self.q_attn.name):
+                self.q_attn.build([None, None, self.embed_dim])
+
+
+class TFMLP(keras.layers.Layer):
+    def __init__(self, n_state, config, **kwargs):
+        super().__init__(**kwargs)
+        nx = config.n_embd
+        self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name="c_fc")
+        self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name="c_proj")
+        self.act = get_tf_activation(config.activation_function)
+        self.dropout = keras.layers.Dropout(config.resid_pdrop)
+        self.intermediate_size = n_state
+        self.embed_dim = nx
+
+    def call(self, x, training=False):
+        h = self.act(self.c_fc(x))
+        h2 = self.c_proj(h)
+        h2 = self.dropout(h2, training=training)
+        return h2
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "c_fc", None) is not None:
+            with tf.name_scope(self.c_fc.name):
+                self.c_fc.build([None, None, self.intermediate_size])
+        if getattr(self, "c_proj", None) is not None:
+            with tf.name_scope(self.c_proj.name):
+                self.c_proj.build([None, None, self.embed_dim])
+
+
+class TFBlock(keras.layers.Layer):  # one transformer block: self-attention, optional cross-attention, then MLP
+    def __init__(self, config, scale=False, **kwargs):
+        super().__init__(**kwargs)
+        nx = config.n_embd
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * nx  # MLP width defaults to 4 * n_embd
+        self.ln_1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1")  # before self-attention
+        self.attn = TFAttention(nx, config, scale, name="attn")
+        self.ln_2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2")  # before the MLP
+
+        if config.add_cross_attention:  # cross-attention sub-layers exist only when the config asks for them
+            self.crossattention = TFAttention(nx, config, scale, name="crossattention", is_cross_attention=True)
+            self.ln_cross_attn = keras.layers.LayerNormalization(
+                epsilon=config.layer_norm_epsilon, name="ln_cross_attn"
+            )
+
+        self.mlp = TFMLP(inner_dim, config, name="mlp")
+        self.hidden_size = config.hidden_size  # used by build() for the LayerNormalization shapes
+
+    def call(
+        self,
+        x,
+        layer_past,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states,
+        encoder_attention_mask,
+        use_cache,
+        output_attentions,
+        training=False,
+    ):
+        a = self.ln_1(x)  # normalize first, then attend
+        output_attn = self.attn(
+            a,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        a = output_attn[0]  # output_attn: a, present, (attentions)
+        outputs = output_attn[1:]
+        x = x + a  # residual connection around self-attention
+
+        # Cross-Attention Block
+        if encoder_hidden_states is not None:
+            # cross-attention requires the block to have been built with the cross-attention sub-layers
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+
+            ca = self.ln_cross_attn(x)
+            output_cross_attn = self.crossattention(
+                ca,
+                layer_past=None,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=False,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            ca = output_cross_attn[0]  # output_attn: a, present, (cross_attentions)
+            x = x + ca  # residual connection around cross-attention
+            outputs = outputs + output_cross_attn[2:]  # add cross attentions if we output attention weights
+
+        m = self.ln_2(x)
+        m = self.mlp(m, training=training)
+        x = x + m  # residual connection around the MLP
+
+        outputs = [x] + outputs
+        return outputs  # x, present, (attentions, cross_attentions)
+
+    def build(self, input_shape=None):  # builds each existing sub-layer under its own name scope; input_shape is not read
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "ln_1", None) is not None:
+            with tf.name_scope(self.ln_1.name):
+                self.ln_1.build([None, None, self.hidden_size])
+        if getattr(self, "attn", None) is not None:
+            with tf.name_scope(self.attn.name):
+                self.attn.build(None)  # TFAttention.build does not read its argument
+        if getattr(self, "ln_2", None) is not None:
+            with tf.name_scope(self.ln_2.name):
+                self.ln_2.build([None, None, self.hidden_size])
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)  # TFMLP.build does not read its argument
+        if getattr(self, "crossattention", None) is not None:  # only set when config.add_cross_attention was truthy
+            with tf.name_scope(self.crossattention.name):
+                self.crossattention.build(None)
+        if getattr(self, "ln_cross_attn", None) is not None:
+            with tf.name_scope(self.ln_cross_attn.name):
+                self.ln_cross_attn.build([None, None, self.hidden_size])
+
+
+@keras_serializable
+class TFGPT2MainLayer(keras.layers.Layer):
+    config_class = GPT2Config
+
+    def __init__(self, config, *inputs, **kwargs):  # creates embeddings, dropout, the block stack and the final norm
+        super().__init__(*inputs, **kwargs)
+
+        self.config = config
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.use_cache = config.use_cache
+        self.return_dict = config.use_return_dict
+
+        self.num_hidden_layers = config.n_layer
+        self.n_embd = config.n_embd
+        self.n_positions = config.n_positions
+        self.initializer_range = config.initializer_range
+
+        self.wte = keras.layers.Embedding(  # token embedding table: vocab_size x hidden_size
+            input_dim=config.vocab_size,
+            output_dim=config.hidden_size,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="wte",
+        )
+        self.wpe = keras.layers.Embedding(  # position embedding table: n_positions x n_embd
+            input_dim=config.n_positions,
+            output_dim=config.n_embd,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="wpe",
+        )
+        self.drop = keras.layers.Dropout(config.embd_pdrop)
+        self.h = [TFBlock(config, scale=True, name=f"h_._{i}") for i in range(config.n_layer)]  # n_layer scaled blocks
+        self.ln_f = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_f")
+        self.embed_dim = config.hidden_size
+
+    def get_input_embeddings(self):  # accessor for the token embedding layer
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):  # replaces the token embedding layer with `new_embeddings`
+        self.wte = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        raise NotImplementedError  # pruning is not implemented for this layer; calling it always raises
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if past_key_values is None:
+            past_length = 0
+            past_key_values = [None] * len(self.h)
+        else:
+            past_length = shape_list(past_key_values[0][0])[-2]
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0)
+
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask_shape = shape_list(attention_mask)
+            attention_mask = tf.reshape(attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]))
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            one_cst = tf.constant(1.0)
+            attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype)
+            attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), tf.constant(-10000.0))
+
+        # Copied from `modeling_tf_t5.py` with -1e9 -> -10000
+        if self.config.add_cross_attention and encoder_attention_mask is not None:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+            # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=encoder_hidden_states.dtype)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
+
+        encoder_attention_mask = encoder_extended_attention_mask
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)
+
+        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = self.wte(input_ids)
+
+        position_embeds = self.wpe(position_ids)
+
+        if token_type_ids is not None:
+            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
+            token_type_embeds = self.wte(token_type_ids)
+        else:
+            token_type_embeds = tf.constant(0.0)
+
+        position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype)
+        token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype)
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states, training=training)
+
+        output_shape = input_shape + [shape_list(hidden_states)[-1]]
+
+        presents = () if use_cache else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
+
+            outputs = block(
+                hidden_states,
+                layer_past,
+                attention_mask,
+                head_mask[i],
+                encoder_hidden_states,
+                encoder_attention_mask,
+                use_cache,
+                output_attentions,
+                training=training,
+            )
+
+            hidden_states, present = outputs[:2]
+            if use_cache:
+                presents = presents + (present,)
+
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[2],)
+                if self.config.add_cross_attention and encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (outputs[3],)
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = tf.reshape(hidden_states, output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
+            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, presents, all_hidden_states, all_attentions, all_cross_attentions]
+                if v is not None
+            )
+
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        """Build `wte`, `wpe`, `ln_f` and every layer in `h` once, each inside its own name scope."""
+        if not self.built:
+            self.built = True
+            for attr_name in ("wte", "wpe"):
+                embedding = getattr(self, attr_name, None)
+                if embedding is not None:
+                    with tf.name_scope(embedding.name):
+                        embedding.build(None)
+            final_norm = getattr(self, "ln_f", None)
+            if final_norm is not None:
+                with tf.name_scope(final_norm.name):
+                    final_norm.build([None, None, self.embed_dim])
+            blocks = getattr(self, "h", None)
+            for block in blocks if blocks is not None else ():
+                with tf.name_scope(block.name):
+                    block.build(None)
+
+
+class TFGPT2PreTrainedModel(TFPreTrainedModel):
+    """
+    Shared base class of the TF GPT-2 models in this file. It pins the configuration class, the base-model prefix,
+    the checkpoint keys tolerated when loading, and the default input signature.
+    """
+
+    config_class = GPT2Config
+    base_model_prefix = "transformer"
+    # patterns containing a '.' name the layers that may be unexpected/missing when a TF model is loaded from PT
+    _keys_to_ignore_on_load_unexpected = [r"h.\d+.attn.bias", r"h.\d+.crossattention.bias"]
+
+    @property
+    def input_signature(self):
+        # token_type_ids is intentionally absent: GPT-2 rarely uses it, and because token_type_ids=0 and
+        # token_type_ids=None produce different outputs, it is kept out of the default signature.
+        signature = {}
+        for key in ("input_ids", "attention_mask"):
+            # both entries are int32 specs with two unconstrained dimensions
+            signature[key] = tf.TensorSpec((None, None), tf.int32, name=key)
+        return signature
+
+
+@dataclass
+class TFGPT2DoubleHeadsModelOutput(ModelOutput):
+    """
+    Output type of [`TFGPT2DoubleHeadsModel`]: language modeling logits plus multiple-choice classification logits.
+
+    Args:
+        logits (`tf.Tensor` of shape `(batch_size, num_choices, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        mc_logits (`tf.Tensor` of shape `(batch_size, num_choices)`):
+            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
+        past_key_values (`list[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads,
+            sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    logits: tf.Tensor | None = None
+    mc_logits: tf.Tensor | None = None
+    past_key_values: list[tf.Tensor] | None = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+GPT2_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`GPT2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""  # class-level docstring text passed to `add_start_docstrings` on the model classes below
+
+GPT2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        past_key_values (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+            `past_key_values` output below). Can be used to speed up sequential decoding. The token ids which have
+            their past given to this model should not be passed as input ids as they have already been computed.
+        attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
+            `past_key_values`. In other words, the `attention_mask` always has to have the length:
+            `len(past_key_values) + len(input_ids)`
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""  # argument docs passed to `add_start_docstrings_to_model_forward` on the `call` methods below
+
+
+@add_start_docstrings(
+    "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2Model(TFGPT2PreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPastAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Hidden states produced by the last layer of the encoder. They feed the cross-attention when the model
+            is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask that keeps the cross-attention (decoder configuration) from attending to padding token indices of
+            the encoder input. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`)
+            Precomputed key and value hidden states of the attention blocks, usable to speed up decoding. When
+            they are given, the caller may pass only the last `input_ids` (those whose past key value states are
+            not already provided), of shape `(batch_size, 1)`, instead of all `input_ids` of shape
+            `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+            (see `past_key_values`). Set to `False` during training, `True` during generation.
+        """
+
+        # Pure pass-through: every argument goes to the shared main layer under its own name.
+        forwarded = {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "position_ids": position_ids,
+            "head_mask": head_mask,
+            "inputs_embeds": inputs_embeds,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": encoder_attention_mask,
+            "use_cache": use_cache,
+            "output_attentions": output_attentions,
+            "output_hidden_states": output_hidden_states,
+            "return_dict": return_dict,
+            "training": training,
+        }
+        return self.transformer(**forwarded)
+
+    def build(self, input_shape=None):
+        """Build the wrapped main layer once, inside its own name scope."""
+        if not self.built:
+            self.built = True
+            if (main_layer := getattr(self, "transformer", None)) is not None:
+                with tf.name_scope(main_layer.name):
+                    main_layer.build(None)
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+
+    def get_output_embeddings(self):
+        return self.get_input_embeddings()
+
+    def set_output_embeddings(self, value):
+        self.set_input_embeddings(value)
+
+    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
+        """Assemble the keyword arguments of one generation step, cut to the last token when a cache exists."""
+        segment_ids = kwargs.get("token_type_ids")
+        if past_key_values:
+            # a cache exists, so only the newest token (and its token type) is kept
+            inputs = tf.expand_dims(inputs[:, -1], -1)
+            if segment_ids is not None:
+                segment_ids = tf.expand_dims(segment_ids[:, -1], -1)
+        positions = kwargs.get("position_ids")
+        mask = kwargs.get("attention_mask")
+        if positions is None and mask is not None:
+            # no explicit positions: use the exclusive running sum of the attention mask
+            positions = tf.math.cumsum(mask, axis=-1, exclusive=True)
+            if past_key_values:
+                positions = tf.expand_dims(positions[:, -1], -1)
+
+        return {
+            "input_ids": inputs,
+            "attention_mask": mask,
+            "position_ids": positions,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+            "token_type_ids": segment_ids,
+        }
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFCausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Hidden states produced by the last layer of the encoder. They feed the cross-attention when the model
+            is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask that keeps the cross-attention (decoder configuration) from attending to padding token indices of
+            the encoder input. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`)
+            Precomputed key and value hidden states of the attention blocks, usable to speed up decoding. When
+            they are given, the caller may pass only the last `input_ids` (those whose past key value states are
+            not already provided), of shape `(batch_size, 1)`, instead of all `input_ids` of shape
+            `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+            (see `past_key_values`). Set to `False` during training, `True` during generation.
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+
+        backbone_out = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        # language-modeling head: project the last hidden states onto the input embedding weights
+        lm_logits = tf.matmul(backbone_out[0], self.transformer.wte.weights, transpose_b=True)
+
+        lm_loss = None
+        if labels is not None:
+            # next-token objective: drop the last logit and the first label so position t is scored against t + 1
+            trimmed_logits = lm_logits[:, :-1]
+            lm_loss = self.hf_compute_loss(labels[:, 1:], trimmed_logits)
+
+        if return_dict:
+            return TFCausalLMOutputWithCrossAttentions(
+                loss=lm_loss,
+                logits=lm_logits,
+                past_key_values=backbone_out.past_key_values,
+                hidden_states=backbone_out.hidden_states,
+                attentions=backbone_out.attentions,
+                cross_attentions=backbone_out.cross_attentions,
+            )
+
+        # tuple output: logits followed by the remaining backbone outputs, with the loss in front when computed
+        tuple_out = (lm_logits,) + backbone_out[1:]
+        return tuple_out if lm_loss is None else (lm_loss,) + tuple_out
+
+    def build(self, input_shape=None):
+        """Build the wrapped main layer once, inside its own name scope."""
+        if not self.built:
+            self.built = True
+            if (main_layer := getattr(self, "transformer", None)) is not None:
+                with tf.name_scope(main_layer.name):
+                    main_layer.build(None)
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for
+    RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the
+    input embeddings, the classification head takes as input the input of a specified classification token index in the
+    input sequence).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        # NOTE(review): presumably read by TFSequenceSummary to size the head to one output — confirm
+        config.num_labels = 1
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+        head_kwargs = {"initializer_range": config.initializer_range, "name": "multiple_choice_head"}
+        self.multiple_choice_head = TFSequenceSummary(config, **head_kwargs)
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFGPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        mc_token_ids: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFGPT2DoubleHeadsModelOutput | tuple[tf.Tensor]:
+        r"""
+        mc_token_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, num_choices)`, *optional*, default to index of the last token of the input):
+            Index of the classification token in each input sequence. Selected in the range `[0, input_ids.size(-1) -
+            1]`.
+
+        Return:
+
+        Examples:
+
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AutoTokenizer, TFGPT2DoubleHeadsModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        >>> model = TFGPT2DoubleHeadsModel.from_pretrained("openai-community/gpt2")
+
+        >>> # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
+
+        >>> embedding_layer = model.resize_token_embeddings(
+        ...     len(tokenizer)
+        ... )  # Update the model embeddings with the new vocabulary size
+
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        >>> encoded_choices = [tokenizer.encode(s) for s in choices]
+        >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+
+        >>> input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
+        >>> mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1
+
+        >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        ```"""
+
+        # the leading dimensions come from `input_ids` when given, otherwise from `inputs_embeds` minus its last axis
+        input_shapes = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)[:-1]
+        # fold every leading dimension into a single one: (..., seq_length) -> (-1, seq_length)
+        flat_shape = (-1, input_shapes[-1])
+        flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids = [
+            None if tensor is None else tf.reshape(tensor, flat_shape)
+            for tensor in (input_ids, attention_mask, token_type_ids, position_ids)
+        ]
+
+        backbone_out = self.transformer(
+            input_ids=flat_input_ids,
+            past_key_values=past_key_values,
+            attention_mask=flat_attention_mask,
+            token_type_ids=flat_token_type_ids,
+            position_ids=flat_position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        raw_hidden = backbone_out[0]
+        # give the final hidden states their original leading dimensions back, keeping the last axis
+        last_hidden = tf.reshape(raw_hidden, input_shapes + shape_list(raw_hidden)[-1:])
+        # Matches the slightly odd PT behaviour: only the final hidden state is reshaped to rank 4 for a rank-3
+        # input, while all other hidden states stay at rank 3 (first two dims merged).
+        all_hidden_states = (
+            backbone_out.hidden_states[:-1] + (last_hidden,) if return_dict and output_hidden_states else None
+        )
+
+        lm_logits = tf.matmul(last_hidden, self.transformer.wte.weights, transpose_b=True)
+        mc_scores = self.multiple_choice_head(last_hidden, mc_token_ids, training=training)
+        mc_logits = tf.squeeze(mc_scores, axis=-1)
+
+        if not return_dict:
+            return (lm_logits, mc_logits) + backbone_out[1:]
+
+        return TFGPT2DoubleHeadsModelOutput(
+            logits=lm_logits,
+            mc_logits=mc_logits,
+            past_key_values=backbone_out.past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=backbone_out.attentions,
+        )
+
+    @property
+    def input_signature(self):
+        """Int32 specs: rank 3 for `input_ids` and `attention_mask`, rank 2 for `mc_token_ids`."""
+        # every dimension is left unconstrained (None); only the rank differs between entries
+        shapes = {"input_ids": (None,) * 3, "attention_mask": (None,) * 3, "mc_token_ids": (None,) * 2}
+        # each spec is named after its own key
+        return {key: tf.TensorSpec(shape, tf.int32, name=key) for key, shape in shapes.items()}
+
+    def build(self, input_shape=None):
+        """Build the main layer and the multiple-choice head once, each inside its own name scope."""
+        if not self.built:
+            self.built = True
+            # same order as the attributes are created in `__init__`
+            for attr_name in ("transformer", "multiple_choice_head"):
+                sublayer = getattr(self, attr_name, None)
+                if sublayer is not None:
+                    with tf.name_scope(sublayer.name):
+                        sublayer.build(None)
+
+
+@add_start_docstrings(
+    """
+    The GPT2 Model transformer with a sequence classification head on top (linear layer).
+
+    [`TFGPT2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    GPT2_START_DOCSTRING,
+)
+class TFGPT2ForSequenceClassification(TFGPT2PreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels  # width of the classification output
+        self.score = keras.layers.Dense(  # bias-free linear head with `config.num_labels` outputs
+            config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="score",
+            use_bias=False,
+        )
+        self.transformer = TFGPT2MainLayer(config, name="transformer")
+        self.config = config  # `call` reads `pad_token_id` from it, `build` reads `n_embd`
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint="microsoft/DialogRPT-updown",
+        output_type=TFSequenceClassifierOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutputWithPast | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification loss, one per row of the batch. Indices should be in
+            `[0, ..., config.num_labels - 1]`.
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)  # scored at every position; one position per row is selected below
+        logits_shape = shape_list(logits)
+        batch_size = logits_shape[0]
+
+        if self.config.pad_token_id is None:
+            last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)  # no pad id: use the last position
+        else:
+            if input_ids is not None:
+                token_indices = tf.range(shape_list(input_ids)[-1])
+                non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype)
+                # Highest index whose token differs from the pad id (0 when a row contains only padding).
+                last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1)
+            else:
+                last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)  # cannot inspect embeddings
+                logger.warning_once(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+        loss = None
+
+        pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1)  # per row: logits at its index
+
+        if labels is not None:
+            if self.config.pad_token_id is None and logits_shape[0] != 1:
+                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+
+            loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels]))
+
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output  # loss comes first when it was computed
+
+        return TFSequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        # One-shot: later calls do nothing because `self.built` is already set. `input_shape` is not used.
+        if not self.built:
+            self.built = True
+            if (head := getattr(self, "score", None)) is not None:
+                with tf.name_scope(head.name):
+                    head.build([None, None, self.config.n_embd])
+            if (backbone := getattr(self, "transformer", None)) is not None:
+                with tf.name_scope(backbone.name):
+                    backbone.build(None)
+
+
+__all__ = [
+    "TFGPT2DoubleHeadsModel",
+    "TFGPT2ForSequenceClassification",
+    "TFGPT2LMHeadModel",
+    "TFGPT2MainLayer",
+    "TFGPT2Model",
+    "TFGPT2PreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..608164ef2d83ab15bf7f99d33f9c6eb56ed1fcff
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2.py
@@ -0,0 +1,334 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+
+import json
+import os
+from functools import lru_cache
+from typing import Optional
+
+import regex as re
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+@lru_cache
+def bytes_to_unicode():
+    """
+    Build the byte -> unicode character table used by the byte-level BPE. The result is cached by `lru_cache`.
+
+    The bytes `!`..`~`, `¡`..`¬` and `®`..`ÿ` map to the character with the same code point. Every other byte
+    (whitespace/control characters the bpe code barfs on) maps to `chr(256 + k)`, where `k` is its position among
+    those remaining bytes in increasing order. This keeps the BPE reversible on arbitrary UTF-8 input without UNKs.
+
+    Returns a dict with one entry per byte value: same-code-point bytes first, then the shifted ones.
+    """
+    same_code_point = [
+        *range(ord("!"), ord("~") + 1),
+        *range(ord("¡"), ord("¬") + 1),
+        *range(ord("®"), ord("ÿ") + 1),
+    ]
+    # Bytes outside the three ranges, in increasing order; the k-th of them is sent to code point 256 + k.
+    shifted = [byte for byte in range(2**8) if byte not in same_code_point]
+    # Fill the table in two passes so the key order is: same-code-point bytes, then shifted bytes.
+    table = {byte: chr(byte) for byte in same_code_point}
+    table.update({byte: chr(2**8 + k) for k, byte in enumerate(shifted)})
+
+    return table
+
+
+def get_pairs(word):
+    """
+    Return the set of adjacent symbol pairs in a word.
+
+    Word is represented as a sequence of symbols (symbols being variable-length strings), typically a tuple. A word
+    with fewer than two symbols has no adjacent pairs, so the empty set is returned for it; this includes an empty
+    word, which is handled here rather than raising `IndexError`.
+    """
+    # Pair every symbol with its successor. `zip` stops at the shorter argument, so words of length 0 or 1 simply
+    # produce nothing and need no special case.
+    successors = word[1:]
+    return set(zip(word, successors))
+
+
+class GPT2Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import GPT2Tokenizer
+
+    >>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*):
+            The token used for padding, for example when batching sequences of different lengths.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (GPT2 tokenizer detect beginning of words by the preceding space).
+        add_bos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
+            word just as any other word.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        pad_token=None,
+        add_prefix_space=False,
+        add_bos_token=False,
+        **kwargs,
+    ):
+        # Special tokens given as plain strings are wrapped in AddedToken with no left/right stripping.
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        self.add_bos_token = add_bos_token
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)  # token (str) -> id
+        self.decoder = {v: k for k, v in self.encoder.items()}  # inverse map: id -> token
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}  # inverse map: character -> byte
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]  # drops the first line and the piece after the last "\n"
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))  # merge pair -> rank (its line position)
+        self.cache = {}  # memo used by `bpe`: token -> space-joined merge result
+        self.add_prefix_space = add_prefix_space
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+        super().__init__(
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_prefix_space=add_prefix_space,
+            add_bos_token=add_bos_token,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)  # counts `self.encoder` entries only; `get_vocab` additionally merges added tokens
+
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)  # new dict: base vocab overlaid with added tokens
+
+    def bpe(self, token):
+        if token in self.cache:  # memoized result from an earlier call
+            return self.cache[token]
+        word = tuple(token)  # one symbol per character to start with
+        pairs = get_pairs(word)
+
+        if not pairs:  # nothing to merge; note this result is returned without being cached
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))  # lowest-rank pair present
+            if bigram not in self.bpe_ranks:  # none of the remaining pairs has a known merge
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):  # rebuild the word, fusing each adjacent (first, second) occurrence
+                try:
+                    j = word.index(first, i)
+                except ValueError:  # `first` does not occur again: copy the tail unchanged
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])  # copy everything before the next `first`
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:  # everything merged into a single symbol
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)  # the symbols are returned as one space-separated string
+        self.cache[token] = word
+        return word
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Prefix each given sequence with the BOS id when `self.add_bos_token` is set; otherwise just concatenate them.
+
+        Returns a new list: `[bos] + token_ids_0`, followed by `[bos] + token_ids_1` when a second sequence is given.
+        """
+        prefix = [self.bos_token_id] if self.add_bos_token else []
+        first_segment = prefix + token_ids_0
+
+        if token_ids_1 is not None:
+            return first_segment + prefix + token_ids_1
+        return first_segment
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Build the special-tokens mask for one sequence or a pair of sequences.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                IDs of the first sequence.
+            token_ids_1 (`list[int]`, *optional*):
+                IDs of the second sequence when masking a pair.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether the given IDs already include the model's special tokens.
+
+        Returns:
+            `list[int]`: 1 marks a special token and 0 a sequence token. When `already_has_special_tokens` is set or
+            `self.add_bos_token` is off, the result comes from the parent class; otherwise each sequence is laid out
+            as one leading 1 followed by a 0 per token.
+        """
+        if already_has_special_tokens or not self.add_bos_token:
+            # Nothing BOS-specific to mark here: defer to the base implementation.
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=bool(already_has_special_tokens),
+            )
+
+        mask = [1] + [0] * len(token_ids_0)
+        if token_ids_1 is not None:
+            mask += [1] + [0] * len(token_ids_1)
+        return mask
+
+    def _tokenize(self, text):
+        """Split `text` with `self.pat`, BPE-encode each piece, and return the flat list of BPE token strings."""
+        bpe_tokens = []
+        for piece in re.findall(self.pat, text):
+            # Re-express the piece's UTF-8 bytes as the stand-in characters of `self.byte_encoder` (no raw spaces).
+            mapped = "".join(map(self.byte_encoder.__getitem__, piece.encode("utf-8")))
+            # `bpe` joins its symbols with single spaces, so splitting on " " recovers them.
+            bpe_tokens += self.bpe(mapped).split(" ")
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Look up `token` in the vocab; unknown tokens get the id of `self.unk_token` (None if that is absent too)."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Look up the token (str) stored for vocab `index`; returns None when the index is not in the vocab."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Concatenate `tokens`, map every character back to its byte, and decode the bytes as UTF-8."""
+        joined = "".join(tokens)
+        raw_bytes = bytearray(self.byte_decoder[char] for char in joined)
+        return raw_bytes.decode("utf-8", errors=self.errors)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        # Writes the vocab JSON and the merges file into `save_directory` and returns both paths. When that path
+        # is not an existing directory, an error is logged and None is returned without writing anything.
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        name_prefix = filename_prefix + "-" if filename_prefix else ""
+        vocab_file = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["merges_file"])
+
+        with open(vocab_file, "w", encoding="utf-8") as vocab_handle:
+            vocab_handle.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        # Merges are written in rank order; `expected_rank` is the rank a gap-free table would have next.
+        expected_rank = 0
+        with open(merge_file, "w", encoding="utf-8") as merges_handle:
+            merges_handle.write("#version: 0.2\n")
+            for symbols, rank in sorted(self.bpe_ranks.items(), key=lambda item: item[1]):
+                if rank != expected_rank:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    expected_rank = rank
+                merges_handle.write(" ".join(symbols) + "\n")
+                expected_rank += 1
+
+        return vocab_file, merge_file
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        # `add_prefix_space` is always removed from kwargs; a per-call value overrides the instance setting.
+        prefix_requested = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        needs_space = is_split_into_words or prefix_requested
+        return (" " + text if needs_space else text, kwargs)
+
+
+__all__ = ["GPT2Tokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..f81c155e864476cf49c24f91a0235c939f42d3e0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+
+from typing import Optional
+
+from ...tokenization_utils_base import BatchEncoding
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_gpt2 import GPT2Tokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class GPT2TokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import GPT2TokenizerFast
+
+    >>> tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (GPT2 tokenizer detect beginning of words by the preceding space).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = GPT2Tokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        add_prefix_space=False,
+        **kwargs,
+    ):
+        super().__init__(  # file paths, special tokens and all extra kwargs are handed to the base class as-is
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+        self.add_bos_token = kwargs.pop("add_bos_token", False)  # read after kwargs were already forwarded above
+
+    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)  # only looked up as a keyword argument
+        assert self.add_prefix_space or not is_split_into_words, (  # pre-split input requires add_prefix_space
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._batch_encode_plus(*args, **kwargs)  # arguments are forwarded unchanged
+
+    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)  # only looked up as a keyword argument
+
+        assert self.add_prefix_space or not is_split_into_words, (  # pre-split input requires add_prefix_space
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._encode_plus(*args, **kwargs)  # arguments are forwarded unchanged
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        # Saving is delegated to the backend tokenizer's model; whatever it returns is converted to a tuple.
+        return tuple(self._tokenizer.model.save(save_directory, name=filename_prefix))
+
+
+__all__ = ["GPT2TokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py
new file mode 100644
index 0000000000000000000000000000000000000000..145a45da0db6d36f75f5cec6091027e36541184e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/tokenization_gpt2_tf.py
@@ -0,0 +1,119 @@
+import os
+from typing import Optional, Union
+
+import tensorflow as tf
+from tensorflow_text import pad_model_inputs
+
+from ...modeling_tf_utils import keras
+from ...utils.import_utils import is_keras_nlp_available, requires
+from .tokenization_gpt2 import GPT2Tokenizer
+
+
+if is_keras_nlp_available():
+    from keras_nlp.tokenizers import BytePairTokenizer
+
+
+@requires(backends=("keras_nlp",))
+class TFGPT2Tokenizer(keras.layers.Layer):
+    """
+    This is an in-graph tokenizer for GPT2. It should be initialized similarly to other tokenizers, using the
+    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
+    from an existing standard tokenizer object.
+
+    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
+    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
+    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
+    straight from `tf.string` inputs to outputs.
+
+    Args:
+        vocab (dict[str, int]): Vocabulary dict for Byte Pair Tokenizer
+        merges (list[str]): Merges list for Byte Pair Tokenizer
+    """
+
+    def __init__(
+        self,
+        vocab: dict[str, int],
+        merges: list[str],
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+    ):
+        super().__init__()
+        self.pad_token_id = pad_token_id  # when None, `call` never pads
+        self.max_length = max_length  # default padding length for `call`; also passed on as `sequence_length`
+        self.vocab = vocab  # kept so `get_config` can return it
+        self.merges = merges  # kept so `get_config` can return it
+
+        self.tf_tokenizer = BytePairTokenizer(vocab, merges, sequence_length=max_length)
+
+    @classmethod
+    def from_tokenizer(cls, tokenizer: GPT2Tokenizer, *args, **kwargs):
+        """Build a TFGPT2Tokenizer from the merge table and vocabulary of an existing GPT2Tokenizer.
+
+        Args:
+            tokenizer (GPT2Tokenizer): source of `bpe_ranks` and `get_vocab()`; extra positional and keyword
+                arguments are passed to the constructor after `vocab` and `merges`.
+
+        Examples:
+
+        ```python
+        from transformers import AutoTokenizer, TFGPT2Tokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        tf_tokenizer = TFGPT2Tokenizer.from_tokenizer(tokenizer)
+        ```
+        """
+        merge_lines = list(map(" ".join, tokenizer.bpe_ranks))
+        return cls(tokenizer.get_vocab(), merge_lines, *args, **kwargs)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
+        """Load a GPT2Tokenizer with `GPT2Tokenizer.from_pretrained`, then convert it with `from_tokenizer`.
+
+        Args:
+            pretrained_model_name_or_path (Union[str, os.PathLike]): Path to pretrained model
+            *init_inputs, **kwargs: given to both of the calls named above.
+
+        Examples:
+
+        ```python
+        from transformers import TFGPT2Tokenizer
+
+        tf_tokenizer = TFGPT2Tokenizer.from_pretrained("openai-community/gpt2")
+        ```
+        """
+        return cls.from_tokenizer(GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs), *init_inputs, **kwargs)
+
+    @classmethod
+    def from_config(cls, config):
+        """Creates TFGPT2Tokenizer by passing the entries of `config` to the constructor as keyword arguments
+
+        Args:
+            config (Dict): Dictionary with keys such as stated in `get_config`.
+        """
+        return cls(**config)
+
+    def get_config(self):  # keys mirror the `__init__` keyword arguments, which is what `from_config` expects
+        return dict(
+            vocab=self.vocab,
+            merges=self.merges,
+            max_length=self.max_length,
+            pad_token_id=self.pad_token_id,
+        )
+
+    def call(self, x, max_length: Optional[int] = None):
+        # Tokenize `x`, optionally pad, and return the ids together with an attention mask.
+        token_ids = self.tf_tokenizer(x)
+        mask = tf.ones_like(token_ids)  # used as-is whenever no padding happens below
+
+        if self.pad_token_id is not None:
+            # A length passed to this call takes precedence over the one given at construction time.
+            target_length = self.max_length if max_length is None else max_length
+            if target_length is not None:
+                token_ids, mask = pad_model_inputs(
+                    token_ids, max_seq_length=target_length, pad_value=self.pad_token_id
+                )
+
+        return {"attention_mask": mask, "input_ids": token_ids}
+
+
+__all__ = ["TFGPT2Tokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..94ba39d69ad638c706f6ac8491e2dea80e269929
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # branch taken only when `typing.TYPE_CHECKING` is true
+    from .configuration_gpt_neox_japanese import *
+    from .modeling_gpt_neox_japanese import *
+    from .tokenization_gpt_neox_japanese import *
+else:
+    import sys
+
+    _file = globals()["__file__"]  # path of this __init__.py
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)  # swaps this module's `sys.modules` entry for a `_LazyModule`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..320157334539b1d7c418c8cf97c8b57dc38629f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py
@@ -0,0 +1,167 @@
+# coding=utf-8
+# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPTNeoX Japanese model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTNeoXJapaneseConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTNeoXJapaneseModel`]. It is used to instantiate
+    a GPTNeoXJapanese model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GPTNeoXJapanese
+    [abeja/gpt-neox-japanese-2.7b](https://huggingface.co/abeja/gpt-neox-japanese-2.7b) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. The default values correspond to the 2.7B model.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the GPTNeoXJapanese model. Defines the number of different tokens that can be
+            represented by the `input_ids` passed when calling [`GPTNeoXJapaneseModel`].
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_multiple_size (`int`, *optional*, defaults to 4):
+            Dimension of the "intermediate" layer in the Transformer encoder is calculated by hidden_size *
+            intermediate_multiple_size.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        rotary_pct (`float`, *optional*, defaults to 1.00):
+            Fraction (1.0 means all) of each attention head's dimensions that receive rotary embeddings.
+        rotary_emb_base (`int`, *optional*, defaults to 10000):
+            Base used to compute the rotary embedding frequencies.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden layer.
+    Example:
+
+    ```python
+    >>> from transformers import GPTNeoXJapaneseConfig, GPTNeoXJapaneseModel
+
+    >>> # Initializing a GPTNeoXJapanese gpt-neox-japanese-2.7b style configuration
+    >>> configuration = GPTNeoXJapaneseConfig()
+
+    >>> # Initializing a model (with random weights) from the gpt-neox-japanese-2.7b style configuration
+    >>> model = GPTNeoXJapaneseModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "gpt_neox_japanese"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=2560,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        intermediate_multiple_size=4,
+        hidden_act="gelu",
+        rotary_pct=1.00,
+        rotary_emb_base=10000,
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        bos_token_id=31996,  # forwarded to PretrainedConfig.__init__ below rather than assigned here
+        eos_token_id=31999,  # forwarded to PretrainedConfig.__init__ below rather than assigned here
+        rope_scaling=None,
+        attention_dropout=0.1,
+        hidden_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_multiple_size = intermediate_multiple_size
+        self.hidden_act = hidden_act
+        self.rotary_pct = rotary_pct
+        self.partial_rotary_factor = rotary_pct  # same value under a second name; presumably read by the shared RoPE helpers - confirm
+        self.rotary_emb_base = rotary_emb_base
+        self.rope_theta = rotary_emb_base  # same value under a second name; presumably read by the shared RoPE helpers - confirm
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it to 'rope_type' (the 'type' key itself is left in place).
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]  # NOTE(review): writes into the dict object the caller passed in
+        rope_config_validation(self)
+
+
+__all__ = ["GPTNeoXJapaneseConfig"]  # the only name a star-import of this module exposes
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..70399f376c7553fc5d7a1437b0a4b760732697ba
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
@@ -0,0 +1,755 @@
+# coding=utf-8
+# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GPTNeoX model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from .configuration_gpt_neox_japanese import GPTNeoXJapaneseConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class GPTNeoXJapanesePreTrainedModel(PreTrainedModel):  # shared base: class-level settings plus weight initialization
+    config: GPTNeoXJapaneseConfig
+    base_model_prefix = "gpt_neox_japanese"
+    _no_split_modules = ["GPTNeoXJapaneseLayer"]  # names the decoder-layer class defined later in this file
+    _skip_keys_device_placement = "past_key_values"
+
+    _can_compile_fullgraph = True
+
+    def _init_weights(self, module):  # dispatches on the module's type; types not listed below are left untouched
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)  # in-place normal init, std from the config
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # the padding row is reset to zero after the normal init
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)  # weight 1 and bias 0
+        elif isinstance(module, GPTNeoXJapaneseAttention):
+            if module.dense_bias is not None:  # bare nn.Parameter on the attention module, so the nn.Linear branch never sees it
+                module.dense_bias.data.zero_()
+
+
+class GPTNeoXJapaneseAttention(nn.Module):  # self-attention: fused QKV projection, partial rotary embedding, separately returned output bias
+    def __init__(self, config, use_bias=False, layer_idx=None):
+        super().__init__()
+        self.num_attention_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.num_attention_heads  # floor division: per-head width
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.layer_idx = layer_idx  # handed to layer_past.update() in forward
+        self.rotary_ndims = int(self.head_size * config.rotary_pct)  # number of leading per-head dims that get rotated
+        self.rope_theta = config.rotary_emb_base
+        self.rotary_emb = GPTNeoXJapaneseRotaryEmbedding(config=config)  # NOTE(review): never called in this class; forward takes precomputed position_embeddings
+        self.attention_dropout = nn.Dropout(config.attention_dropout)
+        self.norm_factor = math.sqrt(self.head_size)  # _attn scales the scores by 1 / norm_factor
+
+        self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=False)
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+        # The output bias is a standalone parameter created only when use_bias is set (GPTNeoXJapaneseLayer does so for the last layer)
+        self.use_bias = use_bias
+        self.dense_bias = nn.Parameter(torch.zeros(config.hidden_size)) if use_bias else None
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        position_ids: torch.LongTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        layer_past: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # required in practice ((cos, sin) is unpacked unconditionally below); default kept for BC
+    ):  # position_ids, use_cache and output_attentions are accepted but never read in this body
+        # Compute QKV
+        # Attention heads [batch, seq_len, hidden_size]
+        #   --> [batch, seq_len, (np * 3 * head_size)]
+        qkv = self.query_key_value(hidden_states)
+
+        # [batch, seq_len, (num_heads * 3 * head_size)]
+        #   --> [batch, seq_len, num_heads, 3 * head_size]
+        new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
+        qkv = qkv.view(*new_qkv_shape)
+
+        # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size]
+        query = qkv[..., : self.head_size].permute(0, 2, 1, 3)  # each head's slice of the fused projection is read as [q | k | v]
+        key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
+        value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
+
+        # Compute rotary embeddings on rotary_ndims
+        query_rot = query[..., : self.rotary_ndims]  # leading dims that are rotated
+        query_pass = query[..., self.rotary_ndims :]  # remaining dims, passed through unrotated
+        key_rot = key[..., : self.rotary_ndims]
+        key_pass = key[..., self.rotary_ndims :]
+
+        cos, sin = position_embeddings  # raises TypeError if the caller left position_embeddings as None
+        query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin)
+        query = torch.cat((query, query_pass), dim=-1).contiguous()  # re-attach the unrotated tail
+        key = torch.cat((key, key_pass), dim=-1).contiguous()
+
+        # Store key/value in the cache and attend over whatever update() hands back
+        if layer_past is not None:
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_ndims,
+                "cache_position": cache_position,
+            }
+            key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)
+
+        # Compute attention
+        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        # Reshape outputs
+        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size)
+        attn_output = self.dense(attn_output)
+
+        return attn_output, attn_weights, self.dense_bias  # the bias is returned, not added here; attn_weights is returned regardless of output_attentions
+
+    @classmethod
+    def _split_heads(cls, tensor, num_attention_heads, attn_head_size):  # NOTE(review): no caller is visible in this part of the file
+        """
+        Splits hidden dim into attn_head_size and num_attention_heads
+        """
+        # tensor: [bs, seq_len, hidden_size]
+        new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size)
+        # -> [bs, seq_len, num_attention_heads, attn_head_size]
+        tensor = tensor.view(new_shape)
+        # -> [bs, num_attention_heads, seq_len, attn_head_size]
+        tensor = tensor.permute(0, 2, 1, 3)
+        return tensor
+
+    @classmethod
+    def _merge_heads(cls, tensor, num_attention_heads, attn_head_size):
+        """
+        Merges attn_head_size dim and num_attn_heads dim into hidden dim
+        """
+        # tensor [bs, num_attention_heads, seq_len, attn_head_size]
+        tensor = tensor.permute(0, 2, 1, 3).contiguous()  # contiguous() so the view() below is valid after the permute
+        # -> [bs, seq_len, num_attention_heads, attn_head_size]
+        tensor = tensor.view(tensor.size(0), tensor.size(1), num_attention_heads * attn_head_size)
+        # -> [bs, seq_len, hidden_size]
+        return tensor
+
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size]
+        # the additive mask is supplied by the caller through attention_mask; no mask buffer is built here
+        batch_size, num_attention_heads, query_length, attn_head_size = query.size()
+        key_length = key.size(-2)
+
+        query = query.view(batch_size * num_attention_heads, query_length, attn_head_size)  # fold batch and heads for the batched matmul
+        key = key.view(batch_size * num_attention_heads, key_length, attn_head_size)
+
+        # [batch_size * num_heads, q_length, kv_length]
+        attn_scores = torch.zeros(  # zero-filled addend for baddbmm below (beta=1.0 adds it to the scaled product)
+            batch_size * num_attention_heads,
+            query_length,
+            key_length,
+            dtype=query.dtype,
+            device=key.device,
+        )
+        attention_scores = torch.baddbmm(
+            attn_scores,
+            query,
+            key.transpose(1, 2),
+            beta=1.0,
+            alpha=1.0 / self.norm_factor,  # scale q @ k^T by 1 / sqrt(head_size)
+        )
+
+        attention_scores = attention_scores.view(batch_size, num_attention_heads, query_length, -1)  # unfold back to 4-D
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key.shape[-2]]  # key is 3-D here, so shape[-2] is key_length
+            attention_scores = attention_scores + causal_mask
+
+        attn_weights = nn.functional.softmax(attention_scores, dim=-1)
+        attn_weights = self.attention_dropout(attn_weights)  # dropout is applied to the probabilities, after softmax
+        attn_weights = attn_weights.to(value.dtype)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding with GPTNeoX->GPTNeoXJapanese
+class GPTNeoXJapaneseRotaryEmbedding(nn.Module):  # produces the (cos, sin) tables that apply_rotary_pos_emb consumes
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: GPTNeoXJapaneseConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))  # prefer "rope_type", fall back to legacy "type"
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # KeyError if rope_type is not a registered key (including None)
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # persistent=False keeps it out of the state_dict
+        self.original_inv_freq = self.inv_freq  # plain second reference to the same tensor object
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):  # x is only read for its device and dtype; assumes position_ids is (batch, seq) - confirm
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)  # (batch, n_freq, 1)
+        position_ids_expanded = position_ids[:, None, :].float()  # (batch, 1, seq)
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"  # "mps" is replaced by "cpu" for autocast
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)  # (batch, seq, n_freq)
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies repeated twice along the last dim
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # cast back to the dtype of x
+
+
+def rotate_half(x):  # maps the last dim [x1 | x2] to [-x2 | x1]
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]  # first half of the last dim (floor division)
+    x2 = x[..., x.shape[-1] // 2 :]  # remaining half
+    return torch.cat((-x2, x1), dim=-1)  # new tensor; x itself is not modified
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):  # position_ids is accepted but never read in the body
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)  # insert a size-1 axis so cos broadcasts against q and k
+    sin = sin.unsqueeze(unsqueeze_dim)  # same for sin
+    q_embed = (q * cos) + (rotate_half(q) * sin)  # out-of-place: q is not modified
+    k_embed = (k * cos) + (rotate_half(k) * sin)  # out-of-place: k is not modified
+    return q_embed, k_embed
+
+
+def bias_dropout_add(x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool) -> Tensor:
+    """add bias to x, apply dropout and residual connection
+
+    Args:
+        x (Tensor): main path of output
+        bias (Tensor): None or attn_bias of the last attention layer
+        residual (Optional[Tensor]): residual value
+        prob (float): dropout probability
+        training (bool): whether in training mode or not
+
+    Returns:
+        Tensor: dropout(x + bias) + residual
+    """
+    if bias is not None:  # bias may be None despite the annotation; the add is then skipped
+        x = x + bias  # out-of-place add: the caller's tensor is not modified
+    out = torch.nn.functional.dropout(x, p=prob, training=training)  # dropout runs after the bias add and before the residual add
+    if residual is not None:
+        out = residual + out
+    return out
+
+
+class GPTNeoXJapaneseMLP(nn.Module):  # feed-forward block: bias-free linear -> activation -> bias-free linear
+    def __init__(self, config):
+        super().__init__()
+        intermediate_size = int(config.hidden_size * config.intermediate_multiple_size)  # the "4h" in the layer names with the default multiplier of 4
+        self.dense_h_to_4h = nn.Linear(config.hidden_size, intermediate_size, bias=False)
+        # Project back to h.
+        self.dense_4h_to_h = nn.Linear(intermediate_size, config.hidden_size, bias=False)
+        self.act = ACT2FN[config.hidden_act]  # looked up by key, so hidden_act must be a key of ACT2FN
+
+    def forward(self, hidden_states):
+        intermediate = self.dense_h_to_4h(hidden_states)  # hidden_size -> intermediate_size
+        intermediate = self.act(intermediate)
+        output = self.dense_4h_to_h(intermediate)  # intermediate_size -> hidden_size
+        return output
+
+
+class GPTNeoXJapaneseLayer(nn.Module):  # one block: layernorm -> attention -> residual, then layernorm -> MLP -> residual
+    def __init__(self, config, layer_number):
+        super().__init__()
+        self.layer_number = layer_number
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # only the last layer's attention is built with the extra output bias
+        self.attention = GPTNeoXJapaneseAttention(
+            config=config, use_bias=layer_number == config.num_hidden_layers - 1, layer_idx=layer_number
+        )
+        self.mlp = GPTNeoXJapaneseMLP(config)
+        self.hidden_dropout = config.hidden_dropout  # a probability, passed to bias_dropout_add in forward
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        layer_past: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # required in practice (the attention module unpacks it unconditionally); default kept for BC
+    ):
+        residual = hidden_states
+        ln_out = self.input_layernorm(hidden_states)  # normalization is applied before attention
+        attn_output, attn_weights, attn_bias = self.attention(
+            ln_out,
+            attention_mask=attention_mask,
+            layer_past=layer_past,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+        )
+
+        # attn_output = dropout(attn_output + attn_bias) + residual
+        attn_output = bias_dropout_add(
+            attn_output,
+            bias=attn_bias.expand_as(residual) if attn_bias is not None else attn_bias,  # expand the (hidden_size,) bias to residual's shape
+            residual=residual,
+            prob=self.hidden_dropout,
+            training=self.training,
+        )
+        mlp_output = self.mlp(self.post_attention_layernorm(attn_output))
+
+        # attn_output = dropout(mlp_output) + attn_output  (bias=None: the MLP has no bias term)
+        attn_output = bias_dropout_add(
+            mlp_output, bias=None, residual=attn_output, prob=self.hidden_dropout, training=self.training
+        )
+
+        return attn_output, attn_weights  # (layer output, attention weights from self.attention)
+
+
+@auto_docstring
+class GPTNeoXJapaneseModel(GPTNeoXJapanesePreTrainedModel):
+    def __init__(self, config):  # builds the token embedding, the layer stack, the final norm and the rotary embedding
+        super().__init__(config)
+        self.config = config
+
+        self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)  # token-id -> hidden-state lookup table
+        self.layers = nn.ModuleList(
+            [GPTNeoXJapaneseLayer(config=config, layer_number=i) for i in range(config.num_hidden_layers)]
+        )
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.rotary_emb = GPTNeoXJapaneseRotaryEmbedding(config=config)  # forward evaluates it once and passes the result to every layer
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):  # accessor for the token embedding module
+        return self.embed_in
+
+    def set_input_embeddings(self, value):  # replaces the token embedding module; `value` is stored as-is, unvalidated
+        self.embed_in = value
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GPTNeoXJapaneseModel
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+        >>> model = GPTNeoXJapaneseModel.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+
+        >>> inputs = tokenizer("日本語のGPT-neoxがHugging Faceで使えます😀", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions  # None means "use the config value"
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither of the two inputs are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_in(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)  # start a fresh cache when caching is on and none was supplied
+
+        seq_length = inputs_embeds.shape[1]
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)  # indices following the cached length
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # default positions: cache_position with a leading size-1 axis
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        all_attentions = () if output_attentions else None  # tuples are collected only when requested, otherwise stay None
+        all_hidden_states = () if output_hidden_states else None
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)  # records the input to layer i
+
+            outputs = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],
+                layer_past=past_key_values,  # the same cache object is handed to every layer
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[1],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)  # the final entry is taken after final_layer_norm
+
+        if not return_dict:
+            return tuple(  # None entries are dropped, so the tuple's length varies with the flags
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """
+        Pick or build the attention mask that is passed to the decoder layers, depending on
+        `config._attn_implementation`.
+
+        Args:
+            attention_mask (`torch.Tensor` or `BlockMask`):
+                Mask supplied by the caller; may be `None`.
+            input_tensor (`torch.Tensor`):
+                Layer input; its dtype, `shape[0]` (batch size) and `shape[1]` (sequence length) size the mask.
+            cache_position (`torch.Tensor`):
+                Forwarded unchanged to `_prepare_4d_causal_attention_mask_with_cache_position`.
+            past_key_values (`Cache`):
+                Cache queried for the number of tokens already seen; may be `None`.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                When `True`, the SDPA-specific shortcuts in this method are skipped.
+
+        Returns:
+            `None`, the incoming mask (possibly converted to a flex block mask), or the mask produced by
+            `_prepare_4d_causal_attention_mask_with_cache_position`.
+        """
+        # Flash Attention 2: hand the caller's mask through only if it actually masks something (contains a 0),
+        # otherwise signal "no mask" with None.
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        # Flex attention: tensor masks are converted to a block mask; anything else is returned untouched.
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        # Key/value length of the mask: the cache's maximum shape for a compilable cache, otherwise the width of
+        # the provided tensor mask, otherwise seen tokens + current tokens + 1.
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`. May be `None`, in which case only the causal
+                structure is encoded.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+            **kwargs:
+                Accepted and ignored by this implementation.
+
+        Returns:
+            `torch.Tensor`: the 4D mask. Blocked positions hold `torch.finfo(dtype).min` and attendable positions
+            hold 0. A 4D `attention_mask` is returned as-is.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            # Start fully blocked, then open up the allowed positions step by step.
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                # Keep `min_dtype` only strictly above the diagonal; the rest becomes 0.
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            # Zero out every key index that is not greater than the query's cache position.
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            # Broadcast the 2D pattern to (batch_size, 1, sequence_length, target_length) without copying.
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                # The sum is exactly 0 where the causal mask allows attention (0) and the 2D mask is 0.
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                # Block those positions as well.
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    GPTNeoXJapanese Model with a `language modeling` head on top for Classifier Model fine-tuning.
+    """
+)
+class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["embed_out.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.gpt_neox_japanese = GPTNeoXJapaneseModel(config)
+        self.embed_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.embed_out
+
+    def set_output_embeddings(self, new_embeddings):
+        self.embed_out = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GPTNeoXJapaneseForCausalLM, GPTNeoXJapaneseConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+        >>> config = GPTNeoXJapaneseConfig.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+        >>> config.is_decoder = True
+        >>> model = GPTNeoXJapaneseForCausalLM.from_pretrained("abeja/gpt-neox-japanese-2.7b", config=config)
+
+        >>> inputs = tokenizer("日本語のGPT-neoxがHugging Faceで使えます😀", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.gpt_neox_japanese(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+        lm_logits = self.embed_out(hidden_states)
+
+        lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+
+            lm_loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "GPTNeoXJapaneseForCausalLM",
+    "GPTNeoXJapaneseLayer",
+    "GPTNeoXJapaneseModel",
+    "GPTNeoXJapanesePreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..584e74a8123e7cfdf31c4738a656a8417085e9a1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -0,0 +1,369 @@
+# coding=utf-8
+# Copyright 2022 ABEJA, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for GPTNeoXJapanese."""
+
+import collections
+import json
+import os
+import re
+import sys
+from typing import Optional
+
+import numpy as np
+
+from ...tokenization_utils_fast import PreTrainedTokenizer
+from ...utils import logging
+
+
+# Module-level logger; used by `save_vocabulary` to warn about non-consecutive vocabulary indices.
+logger = logging.get_logger(__name__)
+
+# File names for the vocabulary and emoji tables, used when `save_vocabulary` writes them out.
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
+
+
+def load_vocab_and_emoji(vocab_file, emoji_file):
+    """Loads a vocabulary file and emoji file into a dictionary."""
+    with open(emoji_file, "r", encoding="utf-8") as f:
+        emoji = json.loads(f.read())
+
+    vocab = collections.OrderedDict()
+    raw_vocab = collections.OrderedDict()
+    ids_to_tokens = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as f:
+        token = f.readlines()
+    token = [[t.rstrip("\n")] if (t == "," or "," not in t) else t.rstrip("\n").split(",") for t in token]
+    for idx, b in enumerate(token):
+        ids_to_tokens[idx] = b
+        raw_vocab[",".join(b)] = idx
+        for wd in b:
+            vocab[wd] = idx
+
+    return vocab, raw_vocab, ids_to_tokens, emoji
+
+
+class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
+    """
+    This tokenizer inherits from [`PreTrainedTokenizer`] and is based on Japanese special Sub-Word-Encoding that is
+    used in this repository (https://github.com/tanreinama/Japanese-BPEEncoder_V2). Check the repository for details.
+    Japanese has a relatively large vocabulary and there is no separation between words. Furthermore, the language is a
+    combination of hiragana, katakana, and kanji, and variants such as "1" and "①" are often used. In order to cope
+    with these, this tokenizer has the following features
+    - Subword-by-subword segmentation, which is intermediate between byte strings and morphological analysis.
+    - BPEs are created for each Kanji, Hiragana, and Katakana character, and there are no BPEs that cross character
+        types, such as Kanji + Hiragana or Hiragana + Katakana.
+    - All-byte encoding that does not require <unk>.
+    - Independent of UTF codes such as 2-byte and 3-byte characters
+    - Conversion of heterographs to the same token_id
+    - Emoji and Emoticon are grouped into 12 types as special tags.
+
+    Example:
+
+    ```python
+    >>> from transformers import GPTNeoXJapaneseTokenizer
+
+    >>> tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b")
+    >>> # You can confirm both 慶応 and 慶應 are encoded to 17749
+    >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
+    [30014, 26883, 26638, 27228, 25, 26650, 31732, 31679, 27809, 26638, 17749, 31592, 17749, 31593, 321, 1281]
+
+    >>> # Both 慶応 and 慶應 are decoded to 慶応
+    >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
+    '吾輩は猫である🐯。実は慶応(慶応)大学出身'
+    ```
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        emoji_file (`str`):
+            File containing the emoji.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The token used for padding
+        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        do_clean_text (`bool`, *optional*, defaults to `False`):
+            Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        emoji_file,
+        unk_token="<|endoftext|>",
+        pad_token="<|endoftext|>",
+        bos_token="<|startoftext|>",
+        eos_token="<|endoftext|>",
+        do_clean_text=False,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        if not os.path.isfile(emoji_file):
+            raise ValueError(
+                f"Can't find a emoji file at path '{emoji_file}'. To load the emoji information from a Google"
+                " pretrained model use `tokenizer = GPTNeoXJapaneseokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.do_clean_text = do_clean_text
+        self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
+        self.subword_tokenizer = SubWordJapaneseTokenizer(
+            vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
+        )
+        super().__init__(
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            do_clean_text=do_clean_text,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        # self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab
+        return len(self.raw_vocab)
+
+    def get_vocab(self):
+        return dict(self.raw_vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text):
+        return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.subword_tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = "".join(tokens).strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+            emoji_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
+            )
+        else:
+            vocab_file = (
+                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
+            )
+            emoji_file = (
+                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
+            )
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token_index, token in self.ids_to_tokens.items():
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(",".join(token) + "\n")
+                index += 1
+        with open(emoji_file, "w", encoding="utf-8") as writer:
+            json.dump(self.emoji, writer)
+        return vocab_file, emoji_file
+
+
+class SubWordJapaneseTokenizer:
+    """
+    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
+    original repository.
+
+    MIT License
+
+    Copyright (c) 2020 tanreinama
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+    documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all copies or substantial portions of
+    the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+    THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE.
+    """
+
+    def __init__(self, vocab, ids_to_tokens, emoji):
+        """
+        Store the lookup tables and pre-compile the patterns used by `clean_text`.
+
+        Args:
+            vocab: mapping from surface form to token id.
+            ids_to_tokens: mapping from token id to the list of its surface forms.
+            emoji: dict read through its `"emoji"` (in `tokenize`) and `"emoji_inv"` (in `convert_id_to_token`) keys.
+        """
+        self.vocab = vocab  # same as swe
+        self.ids_to_tokens = ids_to_tokens  # same as bpe
+        self.emoji = emoji
+        # Length of the longest surface form; bounds the look-ahead window for "<...>" tags in `tokenize`.
+        self.maxlen = np.max([len(w) for w in self.vocab])
+        # Patterns 1-6 are replaced in `clean_text` by <URL>, <EMAIL>, <TEL>, <DATE>, <DATE> and <PRICE>.
+        self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
+        self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
+        self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
+        self.content_repatter4 = re.compile(
+            r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+        )
+        self.content_repatter5 = re.compile(
+            r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+        )
+        # The original version of this regex displays catastrophic backtracking behaviour. We avoid this using
+        # possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly
+        # different regex that should generally have the same behaviour in most non-pathological cases.
+        if sys.version_info >= (3, 11):
+            self.content_repatter6 = re.compile(
+                r"(?:\d,\d{3}|[\d億])*+"
+                r"(?:\d,\d{3}|[\d万])*+"
+                r"(?:\d,\d{3}|[\d千])*+"
+                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+                r"(?:\(税込\)|\(税抜\)|\+tax)*"
+            )
+        else:
+            self.content_repatter6 = re.compile(
+                r"(?:\d,\d{3}|[\d億万千])*"
+                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+                r"(?:\(税込\)|\(税抜\)|\+tax)*"
+            )
+        # Every character listed in `keisen` and `blocks` is translated to the "<BLOCK>" tag.
+        keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
+        blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
+        self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "<BLOCK>"))
+
+    def __len__(self):
+        """Number of token ids."""
+        return len(self.ids_to_tokens)
+
+    def clean_text(self, content):
+        """
+        Replace matches of the pre-compiled patterns with the tags <URL>, <EMAIL>, <TEL>, <DATE> and <PRICE>,
+        map the characters registered in `content_trans1` to <BLOCK>, and collapse runs of <BLOCK> into one.
+        """
+        # The substitutions run in a fixed order; later patterns see the output of earlier ones.
+        content = self.content_repatter1.sub("<URL>", content)
+        content = self.content_repatter2.sub("<EMAIL>", content)
+        content = self.content_repatter3.sub("<TEL>", content)
+        content = self.content_repatter4.sub("<DATE>", content)
+        content = self.content_repatter5.sub("<DATE>", content)
+        content = self.content_repatter6.sub("<PRICE>", content)
+        content = content.translate(self.content_trans1)
+        # Repeat until no adjacent pair is left, so any run length shrinks to a single tag.
+        while "<BLOCK><BLOCK>" in content:
+            content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
+        return content
+
+    def tokenize(self, text, clean=False):
+        """
+        Split `text` into a list of token strings.
+
+        Whitespace, line breaks and tabs are first rewritten to the tags <SP>, <BR> and <TAB>, dash variants are
+        unified, and emoji are replaced through `self.emoji["emoji"]`. The text is then consumed left to right,
+        matching vocabulary entries; characters without a match become <KIGOU>, <U2000U2BFF> or per-byte tokens.
+
+        Args:
+            text (`str`): text to tokenize.
+            clean (`bool`, *optional*, defaults to `False`): whether to run `clean_text` before matching.
+
+        Returns:
+            `list[str]`: the token strings in order.
+        """
+        text = text.replace(" ", "<SP>")
+        text = text.replace("　", "<SP>")
+        # "\r\n" must be handled before the single-character replacements so it yields one <BR>, not two.
+        text = text.replace("\r\n", "<BR>")
+        text = text.replace("\n", "<BR>")
+        text = text.replace("\r", "<BR>")
+        text = text.replace("\t", "<TAB>")
+        text = text.replace("—", "ー")
+        text = text.replace("−", "ー")
+        for k, v in self.emoji["emoji"].items():
+            if k in text:
+                text = text.replace(k, v)
+        if clean:
+            text = self.clean_text(text)
+
+        def check_simbol(x):
+            """True for a single character whose 2-byte UTF-8 encoding lies in one of the listed ranges."""
+            e = x.encode()
+            if len(x) == 1 and len(e) == 2:
+                # Pack the two encoded bytes into one integer for the range comparison.
+                c = (int(e[0]) << 8) + int(e[1])
+                if (
+                    (c >= 0xC2A1 and c <= 0xC2BF)
+                    or (c >= 0xC780 and c <= 0xC783)
+                    or (c >= 0xCAB9 and c <= 0xCBBF)
+                    or (c >= 0xCC80 and c <= 0xCDA2)
+                ):
+                    return True
+            return False
+
+        def checku2e(x):
+            """True for a single character whose 3-byte UTF-8 encoding lies in 0xE28080..0xE2B07F."""
+            e = x.encode()
+            if len(x) == 1 and len(e) == 3:
+                c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
+                if c >= 0xE28080 and c <= 0xE2B07F:
+                    return True
+            return False
+
+        pos = 0
+        result = []
+        while pos < len(text):
+            # A "<" may start a tag, so look ahead up to the longest vocabulary entry; otherwise only substrings
+            # of at most three characters are considered.
+            end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
+            candidates = []  # (token_id, token, pos)
+            # Try the longest substring first.
+            for e in range(end, pos, -1):
+                wd = text[pos:e]
+                if wd in self.vocab:
+                    if wd[0] == "<" and len(wd) > 2:
+                        # A matching tag wins immediately over any other candidate.
+                        candidates = [(self.vocab[wd], wd, e)]
+                        break
+                    else:
+                        candidates.append((self.vocab[wd], wd, e))
+            if len(candidates) > 0:
+                # the smallest token_id is adopted
+                _, wd, e = min(candidates, key=lambda x: x[0])
+                result.append(wd)
+                pos = e
+            else:
+                # No vocabulary entry starts here: consume exactly one character.
+                end = pos + 1
+                wd = text[pos:end]
+                if check_simbol(wd):
+                    result.append("<KIGOU>")
+                elif checku2e(wd):
+                    result.append("<U2000U2BFF>")
+                else:
+                    # Fall back to one token per UTF-8 byte.
+                    for i in wd.encode("utf-8"):
+                        result.append("<|byte%d|>" % i)
+                pos = end
+        return result
+
+    def convert_id_to_token(self, index, breakline="\n"):
+        """
+        Convert a single token id back to text.
+
+        Args:
+            index: token id; the first surface form stored in `ids_to_tokens[index]` is used.
+            breakline (`str`, *optional*, defaults to `"\\n"`): text emitted for the <BR> tag.
+
+        Returns:
+            `str`: the decoded text. A byte token is decoded on its own with `errors="replace"`.
+        """
+        words = []
+        byte_tokens = []
+        word = self.ids_to_tokens[index][0]
+        if word[:6] == "<|byte" and word[-2:] == "|>":
+            byte_tokens.append(int(word[6:-2]))
+        else:
+            # Only one id is processed per call, so `byte_tokens` is always empty when this branch is reached.
+            if len(byte_tokens) > 0:
+                words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+                byte_tokens = []
+            if word[:7] == "<|emoji" and word[-2:] == "|>":
+                words.append(self.emoji["emoji_inv"][word])
+            elif word == "<SP>":
+                words.append(" ")
+            elif word == "<BR>":
+                words.append(breakline)
+            elif word == "<TAB>":
+                words.append("\t")
+            elif word == "<BLOCK>":
+                words.append("▀")
+            elif word == "<KIGOU>":
+                words.append("ǀ")
+            elif word == "<U2000U2BFF>":
+                words.append("‖")
+            else:
+                words.append(word)
+        if len(byte_tokens) > 0:
+            words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+        text = "".join(words)
+        return text
+
+
+# Names exported by `from <this module> import *`.
+__all__ = ["GPTNeoXJapaneseTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e54fff573bc3284897d78701cceed524b666f26
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70df94b2f279f4aa8b1929346f0499bef5415919
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/configuration_gpt_oss.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a38ad7b036115c6a8dfc8299d31067335308b5f1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modeling_gpt_oss.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58c328384d4f11d9d0f5b102fbfe7e721a34862f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_oss/__pycache__/modular_gpt_oss.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f186b307eda97787c0fa0ae3c996777695a8707
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..416f1aaa3d5ee2038677eae16f36bf2bd5c93de9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/configuration_granite_speech.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66c7da841533d71cc7210ec81e523b6ce15fe13a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/feature_extraction_granite_speech.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20524758bca5ac06c1ad7fc1b0512df1f9d494cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/modeling_granite_speech.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1859f05413a8160947a731ca9b0e41bd012c9726
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite_speech/__pycache__/processing_granite_speech.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc3bb4c73c835739ddb032f15362540324528b86
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de5d111f021ba40a229a0980658f520022a8bc5b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/configuration_granitemoe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e54160a374472bac9f9d45755321d20c80feaec
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoe/__pycache__/modeling_granitemoe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ebcf89b12667ea2ab25a8d3db2c0616b3355c70
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61ad2d5d8e1045b78b0a5bfddf88db6dfee29413
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b3db93b295a3c4ee749b8b7d11a023228311a67
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24a80c07bee62cc6258e87afc915ac14e404c44f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c9f03835c46ffc2934e9259148e90363a399db3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aacef232e012c0deb5d8c3c9b9c6abf457e87860
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/herbert/__pycache__/tokenization_herbert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..557ac68e7afe69c6a124f6bd0754184ea2b6afc5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35f806a9831c3bf9f24e567241e3bf699e7dec01
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2614fa62b5341ba29e46539288a293cc615e789a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa264f3e6130b1b348f4ec1d3fd0034f2ee235eb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fac78d81571e4b13017e49ce6699306a732c1804
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65ca021d2da2a20610a6bba549c15c82d7dafa3f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/configuration_idefics2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57da8394089340d4336b7674a76088e713d6d439
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ded1c7bc0aeae8833cf1b446f00d8296d490b286
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/image_processing_idefics2_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c506153a44cf06a3098563da5005d79f02b8bd52
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/modeling_idefics2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a14765a39e630128effef8bc0f3f9cc1e9292751
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics2/__pycache__/processing_idefics2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db6b919b86c92a734e0b7025b791f45060524990
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..339d263f96e816b7b81d93d36f184567e66226d0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bab239e3ee6a52a5007dae8409ceebda112c465
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b8caa3956def119f07377bbf05da305a07bd532
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68c2a7670812d6052d3876034346c1d78d62af00
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cab0aa9516080207d9c57a2c97845509df57675
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8923af1de116219405577646ae2dcedee5602ccc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_ijepa import *
+    from .modeling_ijepa import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f528adad0d55711f40ea21f58e1e0196822f449
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/configuration_ijepa.py
@@ -0,0 +1,121 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""I-JEPA model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+
+
+class IJepaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`IJepaModel`]. It is used to instantiate an IJEPA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the I-JEPA
+    [facebook/ijepa_vith14_1k](https://huggingface.co/facebook/ijepa_vith14_1k) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        pooler_output_size (`int`, *optional*):
+           Dimensionality of the pooler layer. If None, defaults to `hidden_size`.
+        pooler_act (`str`, *optional*, defaults to `"tanh"`):
+           The activation function to be used by the pooler. Keys of ACT2FN are supported for Flax and
+           Pytorch, and elements of https://www.tensorflow.org/api_docs/python/tf/keras/activations are
+           supported for Tensorflow.
+
+    Example:
+
+    ```python
+    >>> from transformers import IJepaConfig, IJepaModel
+
+    >>> # Initializing a IJEPA ijepa-base-patch16-224 style configuration
+    >>> configuration = IJepaConfig()
+
+    >>> # Initializing a model (with random weights) from the ijepa-base-patch16-224 style configuration
+    >>> model = IJepaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "ijepa"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        image_size=224,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        pooler_output_size=None,
+        pooler_act="tanh",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size
+        self.pooler_act = pooler_act
+
+
+__all__ = ["IJepaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py
new file mode 100644
index 0000000000000000000000000000000000000000..b26a78e49ab0e251cd5bdee256a01b8d9da61451
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modeling_ijepa.py
@@ -0,0 +1,540 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/ijepa/modular_ijepa.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_ijepa.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import collections.abc
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import TransformersKwargs, auto_docstring, torch_int
+from ...utils.generic import can_return_tuple, check_model_inputs
+from .configuration_ijepa import IJepaConfig
+
+
+class IJepaPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+                f" Expected {self.num_channels} but got {num_channels}."
+            )
+        if not interpolate_pos_encoding:
+            if height != self.image_size[0] or width != self.image_size[1]:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size[0]}*{self.image_size[1]})."
+                )
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+        return embeddings
+
+
+class IJepaEmbeddings(nn.Module):
+    """
+    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+    """
+
+    def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None:
+        super().__init__()
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
+        self.patch_embeddings = IJepaPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = self.position_embeddings.shape[1]
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        patch_pos_embed = self.position_embeddings
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return patch_pos_embed
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        if bool_masked_pos is not None:
+            seq_length = embeddings.shape[1]
+            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    # Take the dot product between "query" and "key" to get the raw attention scores.
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+    # Normalize the attention scores to probabilities.
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+    # This is actually dropping out entire tokens to attend to, which might
+    # seem a bit unusual, but is taken from the original Transformer paper.
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    # Mask heads if we want to
+    if attention_mask is not None:
+        attn_weights = attn_weights * attention_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class IJepaSelfAttention(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(  # only raised when the config has no `embedding_size` attribute
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)  # per-head width
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dropout_prob = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5  # 1 / sqrt(head size)
+        self.is_causal = False  # forwarded to the attention function as `is_causal`
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size = hidden_states.shape[0]
+        new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size  # -1: inferred by view()
+
+        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)  # heads moved to dim 1
+        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
+        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below if not "eager"
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        context_layer, attention_probs = attention_interface(
+            self,
+            query_layer,
+            key_layer,
+            value_layer,
+            head_mask,  # passed in the attention-mask position
+            is_causal=self.is_causal,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.dropout_prob,  # no dropout outside training
+        )
+
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)  # merge the last two dims
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+class IJepaSelfOutput(nn.Module):
+    """
+    Linear projection followed by dropout. The residual connection is added in IJepaLayer instead of here (as is
+    the case with other models), due to the layernorm applied before each block.
+    """
+
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)  # `input_tensor` is accepted but not used in this method
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class IJepaAttention(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.attention = IJepaSelfAttention(config)
+        self.output = IJepaSelfOutput(config)
+        self.pruned_heads = set()  # heads removed so far; extended by prune_heads()
+
+    def prune_heads(self, heads: set[int]):
+        if len(heads) == 0:
+            return  # nothing to prune
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune the q/k/v projections and, along dim=1, the output projection
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self_attn_output, _ = self.attention(hidden_states, head_mask)  # attention weights are discarded
+        output = self.output(self_attn_output, hidden_states)
+        return output
+
+
+class IJepaIntermediate(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)  # hidden -> intermediate width
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]  # string names are looked up in ACT2FN
+        else:
+            self.intermediate_act_fn = config.hidden_act  # anything else is used as-is
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class IJepaOutput(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)  # intermediate -> hidden width
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states + input_tensor  # residual connection with the tensor passed by the caller
+        return hidden_states
+
+
+class IJepaLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward  # stored; not read within this class
+        self.seq_len_dim = 1  # stored; not read within this class
+        self.attention = IJepaAttention(config)
+        self.intermediate = IJepaIntermediate(config)
+        self.output = IJepaOutput(config)
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        hidden_states_norm = self.layernorm_before(hidden_states)  # layernorm is applied before attention
+        attention_output = self.attention(hidden_states_norm, head_mask)
+
+        # first residual connection (added to the un-normalized input)
+        hidden_states = attention_output + hidden_states
+
+        # in IJepa, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_states)
+
+        return layer_output
+
+
+@auto_docstring
+class IJepaPreTrainedModel(PreTrainedModel):
+    config: IJepaConfig
+    base_model_prefix = "ijepa"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": IJepaLayer,
+        "attentions": IJepaSelfAttention,
+    }
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights of Linear/Conv2d, LayerNorm and IJepaEmbeddings modules."""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()  # zero bias, unit weight
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, IJepaEmbeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(  # same fp32 round-trip as for the weights above
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+            if module.mask_token is not None:
+                module.mask_token.data.zero_()  # the mask token, when present, starts at zero
+
+
+class IJepaEncoder(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([IJepaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False  # set here; not read within this class
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> BaseModelOutput:
+        for i, layer_module in enumerate(self.layer):
+            layer_head_mask = head_mask[i] if head_mask is not None else None  # per-layer entry of the mask
+            hidden_states = layer_module(hidden_states, layer_head_mask)
+
+        return BaseModelOutput(last_hidden_state=hidden_states)  # only the final layer's output is returned here
+
+
+class IJepaPooler(nn.Module):
+    def __init__(self, config: IJepaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
+        self.activation = ACT2FN[config.pooler_act]  # activation looked up by name in ACT2FN
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token (index 0 along dim 1), then applying dense + activation.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+@auto_docstring
+class IJepaModel(IJepaPreTrainedModel):
+    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
+        r"""
+        add_pooling_layer (`bool`, *optional*, defaults to `False`):
+            Whether to add a pooling layer
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to use a mask token for masked image modeling.
+        """
+        super().__init__(config)
+        self.config = config
+        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)
+        self.encoder = IJepaEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = IJepaPooler(config) if add_pooling_layer else None  # no pooler unless requested
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> IJepaPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs(tie_last_hidden_states=False)
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
+        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
+        if pixel_values.dtype != expected_dtype:
+            pixel_values = pixel_values.to(expected_dtype)  # match the patch-projection weight dtype
+
+        embedding_output = self.embeddings(
+            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+        )
+
+        encoder_outputs: BaseModelOutput = self.encoder(embedding_output, head_mask=head_mask)
+
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)  # final layernorm on the encoder output
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        return BaseModelOutputWithPooling(last_hidden_state=sequence_output, pooler_output=pooled_output)
+
+
+@auto_docstring(
+    custom_intro="""
+    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
+    e.g. for ImageNet.
+
+    <Tip>
+
+        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
+        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+        position embeddings to the higher resolution.
+
+    </Tip>
+    """
+)
+class IJepaForImageClassification(IJepaPreTrainedModel):
+    def __init__(self, config: IJepaConfig):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.ijepa = IJepaModel(config, add_pooling_layer=False)  # backbone without the pooler
+
+        # Classifier head (falls back to nn.Identity when config.num_labels is not positive)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.ijepa(
+            pixel_values,
+            head_mask=head_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+        sequence_output = outputs.last_hidden_state
+        logits = self.classifier(sequence_output.mean(dim=1))  # average over dim 1, then classify
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)  # loss only when labels are given
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["IJepaPreTrainedModel", "IJepaModel", "IJepaForImageClassification"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b8e6e152f3c115db4a1712895239467ea45e7df
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.py
@@ -0,0 +1,186 @@
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from transformers.models.ijepa.configuration_ijepa import IJepaConfig
+
+from ...modeling_outputs import BaseModelOutputWithPooling, ImageClassifierOutput
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, torch_int
+from ..vit.modeling_vit import ViTEmbeddings, ViTForImageClassification, ViTModel, ViTPreTrainedModel
+
+
+class IJepaEmbeddings(ViTEmbeddings):
+    def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None:
+        super().__init__(config, use_mask_token)
+        # Remove cls_token from IJepaEmbeddings, as it is not used in the model
+        del self.cls_token
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size))  # one row per patch
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        Interpolate the pre-trained position encodings so that the model can be used on higher resolution images.
+        This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = self.position_embeddings.shape[1]
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings  # counts match and the input is square: reuse as-is
+
+        patch_pos_embed = self.position_embeddings
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size  # target grid size, in patches
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)  # the stored positions are treated as a square grid
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)  # move `dim` to axis 1 before interpolating
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)  # back to (1, positions, dim)
+
+        return patch_pos_embed
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape  # pixel_values must be 4-dimensional
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        if bool_masked_pos is not None:
+            seq_length = embeddings.shape[1]
+            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)  # trailing dim added, dtype matched
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+@auto_docstring
+class IJepaPreTrainedModel(ViTPreTrainedModel):
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights of Linear/Conv2d, LayerNorm and IJepaEmbeddings modules."""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()  # zero bias, unit weight
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, IJepaEmbeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(  # same fp32 round-trip as for the weights above
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+            if module.mask_token is not None:
+                module.mask_token.data.zero_()  # the mask token, when present, starts at zero
+
+
+class IJepaModel(IJepaPreTrainedModel, ViTModel):
+    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
+        r"""
+        add_pooling_layer (`bool`, *optional*, defaults to `False`):
+            Whether to add a pooling layer
+        use_mask_token (`bool`, *optional*, defaults to `False`):
+            Whether to use a mask token for masked image modeling.
+        """
+        super().__init__(config)  # note: add_pooling_layer / use_mask_token are not forwarded to the parent here
+        self.config = config
+        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)  # assigned after the parent init
+
+
+@auto_docstring(
+    custom_intro="""
+    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
+    e.g. for ImageNet.
+
+    <Tip>
+
+        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
+        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+        position embeddings to the higher resolution.
+
+    </Tip>
+    """
+)
+class IJepaForImageClassification(IJepaPreTrainedModel, ViTForImageClassification):
+    def __init__(self, config: IJepaConfig):
+        super().__init__(config)
+        self.ijepa = IJepaModel(config, add_pooling_layer=False)  # backbone without the pooler
+        self.post_init()
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.ijepa(
+            pixel_values,
+            head_mask=head_mask,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+        sequence_output = outputs.last_hidden_state
+        logits = self.classifier(sequence_output.mean(dim=1))  # average over dim 1, then classify
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)  # loss only when labels are given
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "IJepaPreTrainedModel",
+    "IJepaModel",
+    "IJepaForImageClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f66a948f8f2ead454d7b12cd679da19215bf4fdf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37101c3567ec512a2477abdcb6d99fa34b358c91
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/configuration_instructblip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b11fb0bb4244097bbaaf32e65f1eaa5b56f7f2a8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/modeling_instructblip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e9257b8d9916e0f4b87d887889921fa83273ffd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblip/__pycache__/processing_instructblip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bda3110565ed727ca07758fd7eeec9ba1205bd7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/processing_instructblipvideo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd4b9d3c05b62b13db68052ec3607a447fb3adfd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92d1f8ba340619b00abe949f0caf11fbbdba8cd3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/configuration_internvl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc213ecbf133daa522a555788e80f91f00e86d94
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modeling_internvl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e62821ffb638998431b500af746549ff95539b5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/modular_internvl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bb27772d551980591bc074eb618f415854011b0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/processing_internvl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c775c1c75b0b8ce59280d752b080107259fe5d1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/internvl/__pycache__/video_processing_internvl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..030a813a6a416ce09d307fad444eb21792b42bd6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/jamba/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cffd5a21c593601a2dd49bfd60c803c25647a63
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78a40bca366192133e77ddc92c385f71578bc518
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/configuration_kosmos2_5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d624bfc614a3077a25ddbe48d9a25e940b722a09
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4fcab2da3935780829be685743783540fd7de7c1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/image_processing_kosmos2_5_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b04b353e69f4f323a250e658bbe04236d863d68
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/modeling_kosmos2_5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1b46e3797f8f868aa8c6e89d03efd8f2873198e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kosmos2_5/__pycache__/processing_kosmos2_5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b5d4c46d23934b6301073e0f383da6bb86baaa4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c39c5a8e9ee6c96e4a056f8f82d0e6ce912250a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/configuration_layoutlmv3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e35699a59342b713e0b22c99ff970db80a603f9e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/feature_extraction_layoutlmv3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c18666149670cb32212696021032e2df06a84ae
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf3ae90ba8372247ffd6d7e19cc10e73e9c13866
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/image_processing_layoutlmv3_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff636bd8f995fcc21f2cc29848c7a6ce9b54aa44
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_layoutlmv3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8ec5f416037f1751a468d3a17d2a892c622c4b3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/modeling_tf_layoutlmv3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d409d6e492c72461e0c0da6769bb6a0f3df7cba1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/processing_layoutlmv3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b791531e3743248dd77066357d290ea7b4dbcdd3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6084c8a664470b0b9085a08ed202075a4f476b14
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/__pycache__/tokenization_layoutlmv3_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc4ba56803aca18a4f294f880b63c17927070fd8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/configuration_lilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/configuration_lilt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47a4f57c798fd33591ae0845e01a11c262213d06
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/configuration_lilt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/modeling_lilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/modeling_lilt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3efa785517cf84b9177c4c31184a42e10e9d1395
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lilt/__pycache__/modeling_lilt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31c7a8c60ab80dd82aaa50bc77a0dcf93de0539e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_flax_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_flax_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..378b5df98aa00b04a2c3adbec944274a5ee11f3c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_flax_llama.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..926aa533733f115f5df85ccc87f00593a1299022
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/modeling_llama.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..288c4d7dad3897a48567c216ab6b0eaa6a42e6ee
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a853afb0f4b3eb8a113d8922776285c72d10ea15
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama/__pycache__/tokenization_llama_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..773c2c62c2e3667fb65e09e01a4ad9a56fd4cb46
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/configuration_llama4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/configuration_llama4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e5afdb17d5b319df8636cfb2a1886c3ab478bbb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/configuration_llama4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/modeling_llama4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/modeling_llama4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c8d7e77c6baef0f702f807bb807026d398c3898
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/modeling_llama4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/processing_llama4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/processing_llama4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d92fd8c800335898a162543e5cc763349038c94
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/processing_llama4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c8429dc7e80c1ced93d7fa79b1b36d472eec26e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_llava_next import *
+    from .image_processing_llava_next import *
+    from .image_processing_llava_next_fast import *
+    from .modeling_llava_next import *
+    from .processing_llava_next import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/configuration_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/configuration_llava_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ea71b1aa6421c8e2007cbb9399a69e9894288e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/configuration_llava_next.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Llava-NeXT model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class LlavaNextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`]. It is used to instantiate an
+    Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
+    model.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `CLIPVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
+            If `"full"`, the full vision features are used.
+        vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features.
+        image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`):
+            A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
+            of the form `(height, width)`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        image_seq_length (`int`, *optional*, defaults to 576):
+            Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
+
+    Example:
+
+    ```python
+    >>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig
+
+    >>> # Initializing a CLIP-vision config
+    >>> vision_config = CLIPVisionConfig()
+
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+
+    >>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration
+    >>> configuration = LlavaNextConfig(vision_config, text_config)
+
+    >>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration
+    >>> model = LlavaNextForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "llava_next"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-2,
+        image_grid_pinpoints=None,
+        tie_word_embeddings=False,
+        image_seq_length=576,
+        multimodal_projector_bias=True,
+        **kwargs,
+    ):
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.image_seq_length = image_seq_length
+        self.multimodal_projector_bias = multimodal_projector_bias
+
+        if vision_feature_select_strategy not in ["default", "full"]:
+            raise ValueError(
+                "vision_feature_select_strategy should be one of 'default', 'full'."
+                f"Got: {vision_feature_select_strategy}"
+            )
+
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+        image_grid_pinpoints = (
+            image_grid_pinpoints
+            if image_grid_pinpoints is not None
+            else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
+        )
+        self.image_grid_pinpoints = image_grid_pinpoints
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["llama"]()
+
+        self.text_config = text_config
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+__all__ = ["LlavaNextConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..350ce9db7dc6f53e11090bf8f222665f8fcaf9a9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next.py
@@ -0,0 +1,724 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for LLaVa-NeXT."""
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import (
+    BaseImageProcessor,
+    BatchFeature,
+    get_patch_output_size,
+    get_size_dict,
+    select_best_resolution,
+)
+from ...image_transforms import (
+    PaddingMode,
+    convert_to_rgb,
+    get_resize_output_image_size,
+    pad,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    from PIL import Image
+
+
+def divide_to_patches(image: np.ndarray, patch_size: int, input_data_format) -> list[np.ndarray]:
+    """
+    Divides an image into patches of a specified size.
+
+    Args:
+        image (`np.ndarray`):
+            The input image.
+        patch_size (`int`):
+            The size of each patch.
+        input_data_format (`ChannelDimension` or `str`):
+            The channel dimension format of the input image.
+
+    Returns:
+        list: A list of np.ndarray representing the patches.
+    """
+    patches = []
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    for i in range(0, height, patch_size):
+        for j in range(0, width, patch_size):
+            if input_data_format == ChannelDimension.LAST:
+                patch = image[i : i + patch_size, j : j + patch_size]
+            else:
+                patch = image[:, i : i + patch_size, j : j + patch_size]
+            patches.append(patch)
+
+    return patches
+
+
+def expand_to_square(image: np.ndarray, background_color, input_data_format) -> np.ndarray:
+    """
+    Expands an image to a square by adding a background color.
+    """
+
+    height, width = get_image_size(image, channel_dim=input_data_format)
+    if width == height:
+        return image
+    elif width > height:
+        result = np.ones((width, width, image.shape[2]), dtype=image.dtype) * background_color
+        result[(width - height) // 2 : (width - height) // 2 + height, :] = image
+        return result
+    else:
+        result = np.ones((height, height, image.shape[2]), dtype=image.dtype) * background_color
+        result[:, (height - width) // 2 : (height - width) // 2 + width] = image
+        return result
+
+
+class LlavaNextImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a LLaVa-NeXT image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
+    for processing high resolution images as explained in the [LLaVa paper](https://huggingface.co/papers/2310.03744).
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        image_grid_pinpoints (`List` *optional*, defaults to `[[672, 336], [336, 672], [672, 672], [336, 1008], [1008, 336]]`):
+            A list of possible resolutions to use for processing high resolution images. The best resolution is selected
+            based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`dict[str, int]` *optional*, defaults to 224):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+                Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
+                number of patches in the batch. Padding will be applied to the bottom and right with zeros.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values", "image_sizes"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        image_grid_pinpoints: Optional[list] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = True,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)  # keyword arguments not named above are forwarded to the base class
+        size = size if size is not None else {"shortest_edge": 224}  # default: shortest edge of 224
+        size = get_size_dict(size, default_to_square=False)
+        image_grid_pinpoints = (
+            image_grid_pinpoints
+            if image_grid_pinpoints is not None
+            else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]  # default candidate resolutions
+        )
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}  # default: 224x224 crop
+        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+
+        # Everything below is stored as-is and serves as the default for the matching `preprocess` argument.
+        self.do_resize = do_resize
+        self.size = size
+        self.image_grid_pinpoints = image_grid_pinpoints
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN  # default: OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD  # default: OPENAI_CLIP_STD
+        self.do_pad = do_pad
+        self.do_convert_rgb = do_convert_rgb
+
+    # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize with CLIP->LLaVa
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the output image. Must hold `"shortest_edge"` or both `"height"` and `"width"`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]  # a single edge length instead of a (height, width) pair
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            default_to_square=default_to_square,
+            input_data_format=input_data_format,
+        )
+
+        return resize(  # module-level `resize` helper, not a recursive call to this method
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def pad(
+        self,
+        image: np.ndarray,
+        padding: Union[int, tuple[int, int], Iterable[tuple[int, int]]],
+        mode: PaddingMode = PaddingMode.CONSTANT,
+        constant_values: Union[float, Iterable[float]] = 0.0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
+        dimension or in the (`num_patches`) dimension. In the second case an iterable of tuples is expected
+        as input.
+
+        Args:
+            image (`np.ndarray`):
+                The image to pad.
+            padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
+                Padding to apply to the edges of the height, width axes. Can be one of three formats:
+                - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
+                - `((before, after),)` yields same before and after pad for height and width.
+                - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
+            mode (`PaddingMode`):
+                The padding mode to use. Can be one of:
+                    - `"constant"`: pads with a constant value.
+                    - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
+                    vector along each axis.
+                    - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
+                    - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use the inferred format of the input image.
+
+        Returns:
+            `np.ndarray`: The padded image.
+
+        """
+
+        # use the general `pad` unless `padding` has exactly 4 entries, i.e. it also covers the `num_patches` dim
+        if isinstance(padding, int) or len(padding) != 4:
+            return pad(image, padding, mode, constant_values, data_format, input_data_format)
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+        if mode == PaddingMode.CONSTANT:
+            image = np.pad(image, padding, mode="constant", constant_values=constant_values)
+        elif mode == PaddingMode.REFLECT:
+            image = np.pad(image, padding, mode="reflect")
+        elif mode == PaddingMode.REPLICATE:
+            image = np.pad(image, padding, mode="edge")  # "edge" is the numpy mode used for replicate padding
+        elif mode == PaddingMode.SYMMETRIC:
+            image = np.pad(image, padding, mode="symmetric")
+        else:
+            raise ValueError(f"Invalid padding mode: {mode}")
+        image = (  # the layout is only converted when an output `data_format` was requested
+            to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
+        )
+        return image
+
+    def _preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> list[np.ndarray]:
+        """
+        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size handed to `self.resize`; see that method for the accepted keys.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter, one of the enum `PILImageResampling`. Only has an effect if `do_resize` is `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation for normalization. Only has an effect if `do_normalize` is set to `True`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `list[np.ndarray]`: One processed image per input, passed through `to_channel_dimension_format`.
+        """
+        images = make_flat_list_of_images(images)
+
+        all_images = []
+        for image in images:
+            if do_resize:
+                image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+            if do_center_crop:
+                image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
+
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            all_images.append(image)
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            for image in all_images
+        ]
+
+        return images
+
+    def _resize_for_patching(
+        self, image: np.ndarray, target_resolution: tuple, resample, input_data_format: ChannelDimension
+    ) -> np.ndarray:
+        """
+        Scale `image` to the size that `get_patch_output_size` reports for `target_resolution`.
+
+        Args:
+            image (np.ndarray):
+                Image to scale.
+            target_resolution (tuple):
+                Resolution, as (height, width), that the image is being fitted to.
+            resample (`PILImageResampling`):
+                Resampling filter handed to `resize`.
+            input_data_format (`ChannelDimension` or `str`):
+                Channel dimension format of `image`.
+
+        Returns:
+            np.ndarray: The resized image. No padding is applied in this method.
+        """
+        fitted_size = get_patch_output_size(image, target_resolution, input_data_format)
+        fitted_height, fitted_width = fitted_size
+
+        return resize(
+            image, (fitted_height, fitted_width), resample=resample, input_data_format=input_data_format
+        )
+
+    def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
+        # Returns ((top, bottom), (left, right)); an odd surplus puts the extra pixel on the bottom/right side.
+        (src_height, src_width), (dst_height, dst_width) = original_resolution, target_resolution
+        left, extra_cols = divmod(dst_width - src_width, 2)
+        top, extra_rows = divmod(dst_height - src_height, 2)
+        return (top, top + extra_rows), (left, left + extra_cols)
+
+    def _pad_for_patching(
+        self, image: np.ndarray, target_resolution: tuple, input_data_format: ChannelDimension
+    ) -> np.ndarray:
+        """
+        Pad `image` up to `target_resolution` (height, width). The amount per side comes from
+        `_get_padding_size`, applied to the size `get_patch_output_size` reports for the image.
+        Returns the padded `np.ndarray`.
+        """
+        new_resolution = get_patch_output_size(image, target_resolution, input_data_format)
+        padding = self._get_padding_size(new_resolution, target_resolution)
+        # Forward the known layout so `pad` does not have to re-infer the channel axis from the array shape.
+        return self.pad(image, padding=padding, input_data_format=input_data_format)
+
+    def get_image_patches(
+        self,
+        image: np.ndarray,
+        grid_pinpoints,
+        size: tuple,
+        patch_size: int,
+        resample: PILImageResampling,
+        data_format: ChannelDimension,
+        input_data_format: ChannelDimension,
+    ) -> list[np.ndarray]:
+        """
+        Process an image with variable resolutions by dividing it into patches.
+
+        Args:
+            image (np.ndarray):
+                The input image to be processed.
+            grid_pinpoints (List):
+                A list of possible resolutions. Any other type raises a `TypeError`.
+            size (`tuple`):
+                Size to resize the original image to.
+            patch_size (`int`):
+                Size of the patches to divide the image into.
+            resample (`PILImageResampling`):
+                Resampling filter to use if resizing the image.
+            data_format (`ChannelDimension` or `str`):
+                The channel dimension format for the output image.
+            input_data_format (`ChannelDimension` or `str`):
+                The channel dimension format of the input image.
+
+        Returns:
+            list[np.ndarray]: The resized original image followed by the processed image patches.
+        """
+        if not isinstance(grid_pinpoints, list):
+            raise TypeError("grid_pinpoints must be a list of possible resolutions.")
+
+        possible_resolutions = grid_pinpoints
+
+        image_size = get_image_size(image, channel_dim=input_data_format)
+        best_resolution = select_best_resolution(image_size, possible_resolutions)
+        resized_image = self._resize_for_patching(
+            image, best_resolution, resample=resample, input_data_format=input_data_format
+        )
+        padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format)
+
+        patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format)
+
+        # make sure that all patches are in the requested `data_format`
+        patches = [
+            to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format)
+            for patch in patches
+        ]
+
+        resized_original_image = resize(  # the whole, unpatched image resized to `size`
+            image,
+            size=size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+
+        image_patches = [resized_original_image] + patches  # the resized original always comes first
+
+        return image_patches
+
+    def _pad_for_batching(
+        self,
+        pixel_values: list[np.ndarray],
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Zero-pad the leading `num_patches` axis of every entry so all entries hold the same number of patches.
+
+        Args:
+            pixel_values (`list[np.ndarray]`):
+                One array per image; its first axis is the patch axis, followed by three image axes.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                Channel dimension format of the returned arrays. One of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                When unset, the format of the input is kept.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                Channel dimension format of the arrays passed in. One of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                When unset, it is inferred by `self.pad`.
+
+        Returns:
+            list[`np.ndarray`]: The padded arrays, in the same order as `pixel_values`.
+        """
+        patch_counts = [len(patches) for patches in pixel_values]
+        max_patch = max(patch_counts)
+        padded = []
+        for image in pixel_values:
+            # Only the trailing side of the patch axis grows; the three image axes get no padding.
+            missing = max_patch - image.shape[0]
+            patch_padding = ((0, missing), (0, 0), (0, 0), (0, 0))
+            padded.append(
+                self.pad(image, padding=patch_padding, data_format=data_format, input_data_format=input_data_format)
+            )
+
+        return padded
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        image_grid_pinpoints: Optional[list] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_pad: Optional[bool] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Turn every image into a resized base view plus grid patches, then preprocess all of them.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size used for resizing; holds `"shortest_edge"` or both `"height"` and `"width"`.
+            image_grid_pinpoints (`List` *optional*, defaults to `self.image_grid_pinpoints`):
+                A list of possible resolutions to use for processing high resolution images. The best resolution is
+                selected based on the original size of the image.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter, one of the enum `PILImageResampling`. Only has an effect if `do_resize` is `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation for normalization. Only has an effect if `do_normalize` is set to `True`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to zero-pad the patch dimension of every image to the largest patch count in the batch.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `BatchFeature`: `pixel_values` (the patches of each image) and `image_sizes` (sizes of the inputs).
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size, param_name="size", default_to_square=False)
+        image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        processed_images = []
+        image_sizes = [get_image_size(image, channel_dim=input_data_format) for image in images]
+        # The base view is square and its edge is the same for every image, so compute it once.
+        base_edge = size["shortest_edge"] if "shortest_edge" in size else min(size["height"], size["width"])
+        for image in images:
+            # convert image into a list of patches
+            # we intentionally use the same data format as the input data format
+            image_patches = self.get_image_patches(
+                image,
+                image_grid_pinpoints,
+                size=(base_edge, base_edge),
+                patch_size=crop_size["height"],
+                resample=resample,
+                data_format=input_data_format,
+                input_data_format=input_data_format,
+            )
+
+            # preprocess patches
+            pixel_values = self._preprocess(
+                image_patches,
+                do_resize=do_resize,
+                size=size,
+                resample=resample,
+                do_center_crop=do_center_crop,
+                crop_size=crop_size,
+                do_rescale=do_rescale,
+                rescale_factor=rescale_factor,
+                do_normalize=do_normalize,
+                image_mean=image_mean,
+                image_std=image_std,
+                data_format=data_format,
+                input_data_format=input_data_format,
+            )
+            pixel_values = np.array(pixel_values)
+            processed_images.append(pixel_values)
+
+        if do_pad:
+            processed_images = self._pad_for_batching(processed_images)
+
+        return BatchFeature(
+            data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
+        )
+
+
+__all__ = ["LlavaNextImageProcessor"]  # the only name exported by `from <module> import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..df20e2b90e8323ff17ed2c80d6d5369aba85b428
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/image_processing_llava_next_fast.py
@@ -0,0 +1,281 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for LLaVa-NeXT."""
+
+from typing import Optional, Union
+
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils import BatchFeature, get_patch_output_size, select_best_resolution
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    divide_to_patches,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    get_image_size,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+)
+
+
+class LlavaNextFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    image_grid_pinpoints (`list[list[int]]`, *optional*):
+        Candidate resolutions for processing high-resolution images. For each image, the best-fitting candidate is
+        selected from the image's original size. Can be overridden by `image_grid_pinpoints` in the `preprocess`
+        method.
+    """
+
+    image_grid_pinpoints: Optional[list[list[int]]]  # LLaVa-NeXT-specific kwarg added on top of the base kwargs
+
+
+@auto_docstring
+class LlavaNextImageProcessorFast(BaseImageProcessorFast):
+    # Class-level defaults; these still need to be checked against the slow image processor.
+    # Any `None` values left after that check can be removed.
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"shortest_edge": 224}
+    default_to_square = False
+    crop_size = {"height": 224, "width": 224}
+    do_resize = True
+    do_center_crop = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    do_pad = True
+    image_grid_pinpoints = [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
+    valid_kwargs = LlavaNextFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]):
+        super().__init__(**kwargs)  # no extra state here: every kwarg is forwarded to the base class
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaNextFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)  # thin override: only the kwargs annotation is specialized
+
+    def _resize_for_patching(
+        self,
+        image: "torch.Tensor",
+        target_resolution: tuple,
+        interpolation: "F.InterpolationMode",
+        input_data_format: ChannelDimension,
+    ) -> "torch.Tensor":
+        """
+        Resize an image so that it fits a target resolution, keeping its aspect ratio.
+
+        The output size is computed by `get_patch_output_size`; no padding is applied here (see
+        `_pad_for_patching` for that step).
+
+        Args:
+            image ("torch.Tensor"):
+                The image to resize.
+            target_resolution (tuple):
+                The `(height, width)` resolution the image has to fit.
+            interpolation (`InterpolationMode`):
+                Resampling filter used for the resize.
+            input_data_format (`ChannelDimension` or `str`):
+                The channel dimension format of `image`.
+
+        Returns:
+            "torch.Tensor": The resized image.
+        """
+        # Size that the helper computes for this image and target resolution.
+        out_height, out_width = get_patch_output_size(image, target_resolution, input_data_format)
+        return self.resize(
+            image=image,
+            size=SizeDict(height=out_height, width=out_width),
+            interpolation=interpolation,
+        )
+
+    def _get_padding_size(self, original_resolution: tuple, target_resolution: tuple):
+        # Returns [left, top, right, bottom]; an odd leftover pixel goes to the right/bottom side.
+        src_height, src_width = original_resolution
+        dst_height, dst_width = target_resolution
+        left, top = (dst_width - src_width) // 2, (dst_height - src_height) // 2
+        return [left, top, dst_width - src_width - left, dst_height - src_height - top]
+
+    def _pad_for_patching(
+        self, image: "torch.Tensor", target_resolution: tuple, input_data_format: ChannelDimension
+    ) -> "torch.Tensor":
+        """
+        Pad `image` up to `target_resolution`, splitting the padding between opposite borders.
+
+        The padding amounts come from `_get_padding_size`, applied to the size that `get_patch_output_size`
+        reports for `image` and `target_resolution`.
+        """
+        fitted_size = get_patch_output_size(image, target_resolution, input_data_format)
+        borders = self._get_padding_size(fitted_size, target_resolution)
+        return F.pad(image, padding=borders)
+
+    def _get_image_patches(
+        self,
+        image: "torch.Tensor",
+        grid_pinpoints,
+        size: tuple,
+        patch_size: int,
+        interpolation: "F.InterpolationMode",
+    ) -> list["torch.Tensor"]:
+        """
+        Split an image into patches at the best-matching resolution, preceded by a resized copy of the whole image.
+
+        Args:
+            image ("torch.Tensor"):
+                The input image; it is read as channels-first.
+            grid_pinpoints (List):
+                A list of possible resolutions. Anything that is not a `list` is rejected.
+            size (`tuple`):
+                Size to resize the original image to.
+            patch_size (`int`):
+                Size of the patches to divide the image into.
+            interpolation (`"InterpolationMode"`):
+                Resampling filter to use if resizing the image.
+
+        Returns:
+            list["torch.Tensor"]: The resized original image followed by the patches.
+
+        Raises:
+            TypeError: If `grid_pinpoints` is not a list.
+        """
+        if not isinstance(grid_pinpoints, list):
+            raise TypeError("grid_pinpoints must be a list of possible resolutions.")
+
+        channels_first = ChannelDimension.FIRST
+        original_size = get_image_size(image, channel_dim=channels_first)
+        target_resolution = select_best_resolution(original_size, grid_pinpoints)
+
+        # Resize, then pad up to the selected resolution, then cut into patches.
+        fitted = self._resize_for_patching(
+            image, target_resolution, interpolation=interpolation, input_data_format=channels_first
+        )
+        fitted = self._pad_for_patching(fitted, target_resolution, input_data_format=channels_first)
+        patches = divide_to_patches(fitted, patch_size=patch_size)
+        return [F.resize(image, size=size, interpolation=interpolation)] + patches
+
+    def _pad_for_batching(
+        self,
+        pixel_values: list["torch.Tensor"],
+    ) -> list["torch.Tensor"]:
+        """
+        Zero-pad every entry along its first (patch) dimension so that all entries hold the same number of patches.
+
+        Args:
+            pixel_values (`list[torch.Tensor]`):
+                One tensor per image. The pad spec below covers four dimensions, so each tensor is expected to
+                be 4-D with the patches on dimension 0.
+
+        Returns:
+            list[`torch.Tensor`]: The padded tensors, in the same order as the input.
+        """
+        target_count = max(len(entry) for entry in pixel_values)
+        padded = []
+        for entry in pixel_values:
+            missing = target_count - entry.shape[0]
+            padded.append(torch.nn.functional.pad(entry, pad=[0, 0, 0, 0, 0, 0, 0, missing]))
+        return padded
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        image_grid_pinpoints: list[list[int]],
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        do_pad: bool,
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Expand every image into patches, preprocess the patches and wrap the results in a `BatchFeature`.
+
+        For each image, `_get_image_patches` yields a resized full view followed by its patches. These are
+        optionally resized and center-cropped, then rescaled/normalized; with `do_pad`, all images are padded to
+        the same number of patches.
+
+        `size` must not be `None`: when it lacks `height`/`width`, the code falls back to `size.shortest_edge`.
+
+        Extra keyword arguments (`**kwargs`) are accepted but not used.
+
+        Returns:
+            `BatchFeature`: `pixel_values` (stacked into one tensor only when `return_tensors` is set) and
+            `image_sizes`, which holds what `get_image_size` reports for each input image.
+        """
+        # Size of the resized full view: (height, width) when both are set, otherwise a `shortest_edge` square.
+        has_full_size = size and size.height and size.width
+        size_tuple = (size.height, size.width) if has_full_size else (size.shortest_edge, size.shortest_edge)
+
+        # Patch size: prefer the crop height, then the resize height, then `shortest_edge`.
+        if crop_size and crop_size.height:
+            patch_size = crop_size.height
+        else:
+            patch_size = size.height if size and size.height else size.shortest_edge
+
+        processed_images, image_sizes = [], []
+        for image in images:
+            image_patches = self._get_image_patches(
+                image, image_grid_pinpoints, size=size_tuple, patch_size=patch_size, interpolation=interpolation
+            )
+
+            # Patches sharing a shape are stacked so that each transform runs once per shape.
+            grouped, grouped_index = group_images_by_shape(image_patches, disable_grouping=disable_grouping)
+            processed_groups = {}
+            for shape, stacked in grouped.items():
+                if do_resize:
+                    stacked = self.resize(image=stacked, size=size, interpolation=interpolation)
+                if do_center_crop:
+                    stacked = self.center_crop(stacked, crop_size)
+                # Rescaling and normalization are handled by one fused helper call.
+                processed_groups[shape] = self.rescale_and_normalize(
+                    stacked, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+                )
+
+            # Restore the original patch order, then stack into one tensor per image if tensors are requested.
+            ordered = reorder_images(processed_groups, grouped_index)
+            processed_images.append(torch.stack(ordered, dim=0) if return_tensors else ordered)
+            image_sizes.append(get_image_size(image, ChannelDimension.FIRST))
+
+        # Equalize the patch count across images first, then batch them if tensors are requested.
+        if do_pad:
+            processed_images = self._pad_for_batching(processed_images)
+        if return_tensors:
+            processed_images = torch.stack(processed_images, dim=0)
+        return BatchFeature(
+            data={"pixel_values": processed_images, "image_sizes": image_sizes}, tensor_type=return_tensors
+        )
+
+
+__all__ = ["LlavaNextImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/modeling_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/modeling_llava_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..a75b4b7981078cd690ae7d063e5715dce8bf6696
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/modeling_llava_next.py
@@ -0,0 +1,793 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Llava-NeXT model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...image_processing_utils import select_best_resolution
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ..auto import AutoModel
+from .configuration_llava_next import LlavaNextConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
+    """
+    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+    Args:
+        image_size (`tuple`, `list`, `torch.Tensor` or `np.ndarray`):
+            The size of the input image in the format (height, width).
+        grid_pinpoints (`List`):
+            A list of possible resolutions, each a tuple or list of the form `(height, width)`.
+        patch_size (`int`):
+            The size of each image patch.
+
+    Returns:
+        tuple: The patch grid shape `(height // patch_size, width // patch_size)` of the selected resolution.
+
+    Raises:
+        TypeError: If `grid_pinpoints` is not a list, or `image_size` has an unsupported type.
+    """
+    if not isinstance(grid_pinpoints, list):
+        raise TypeError("grid_pinpoints should be a list of tuples or lists")
+    # Tensors/arrays must become plain Python lists first, otherwise the calculation comes out wrong.
+    if isinstance(image_size, (torch.Tensor, np.ndarray)):
+        image_size = image_size.tolist()
+    elif not isinstance(image_size, (list, tuple)):
+        raise TypeError(
+            f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+        )
+    best_height, best_width = select_best_resolution(image_size, grid_pinpoints)
+    return best_height // patch_size, best_width // patch_size
+
+
+def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int):
+    """
+    Calculate the number of patches after the preprocessing for images of any resolution.
+
+    The count is one patch per `patch_size` step along the height and the width of the best-fitting
+    resolution (a partial last step still counts), plus one extra patch for the base image.
+
+    Args:
+        image_size (`torch.LongTensor` or `np.ndarray` or `tuple[int, int]`):
+            The size of the input image in the format (height, width).
+        grid_pinpoints (`List`):
+            A list containing possible resolutions. Each item in the list should be a tuple or list
+            of the form `(height, width)`.
+        patch_size (`int`):
+            The size of each image patch.
+
+    Returns:
+        int: the number of patches
+
+    Raises:
+        TypeError: If `grid_pinpoints` is not a list, or `image_size` has an unsupported type.
+    """
+    if not isinstance(grid_pinpoints, list):
+        raise TypeError("grid_pinpoints should be a list of tuples or lists")
+
+    # Tensors/arrays must become plain Python lists first, otherwise the calculation comes out wrong.
+    if isinstance(image_size, (torch.Tensor, np.ndarray)):
+        image_size = image_size.tolist()
+    elif not isinstance(image_size, (list, tuple)):
+        raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}")
+    height, width = select_best_resolution(image_size, grid_pinpoints)
+    # Same count as looping over both ranges; the trailing `+ 1` adds the base patch.
+    grid_rows = len(range(0, height, patch_size))
+    grid_cols = len(range(0, width, patch_size)) if grid_rows else 0
+    return grid_rows * grid_cols + 1
+
+
+def unpad_image(tensor, original_size):
+    """
+    Unpads a PyTorch tensor of a padded and resized image.
+
+    The padded axis is chosen by comparing aspect ratios, and the same amount is cut from both of its ends.
+
+    Args:
+        tensor (`torch.Tensor`):
+            The image tensor, assumed to be of shape (num_channels, height, width).
+        original_size (`tuple`):
+            The original size of the image (height, width). Lists, tensors and NumPy arrays are accepted too.
+
+    Returns:
+        `torch.Tensor`: The unpadded image tensor.
+
+    Raises:
+        TypeError: If `original_size` has an unsupported type.
+    """
+    if isinstance(original_size, (torch.Tensor, np.ndarray)):
+        original_size = original_size.tolist()
+    elif not isinstance(original_size, (list, tuple)):
+        raise TypeError(
+            f"image_size invalid type: {type(original_size)} not valid, should be either list, tuple, np.ndarray or tensor"
+        )
+    orig_h, orig_w = original_size
+    cur_h, cur_w = tensor.shape[1:]
+
+    if orig_w / orig_h > cur_w / cur_h:
+        # The original is relatively wider: the padding sits above and below the content.
+        content_h = int(round(orig_h * (cur_w / orig_w), 7))
+        margin = (cur_h - content_h) // 2
+        return tensor[:, margin : cur_h - margin, :]
+
+    # Otherwise the padding sits left and right of the content.
+    content_w = int(round(orig_w * (cur_h / orig_h), 7))
+    margin = (cur_w - content_w) // 2
+    return tensor[:, :, margin : cur_w - margin]
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Llava outputs, with hidden states and attentions.
+    """
+)
+class LlavaNextModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (keys and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Image hidden states produced by the vision encoder, after projecting its last hidden state.
+    """
+
+    image_hidden_states: Optional[torch.FloatTensor] = None  # the only field declared on top of the base output
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for LlavaNext causal language model (or autoregressive) outputs.
+    """
+)
+class LlavaNextCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (keys and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size * num_patches, num_images, sequence_length, hidden_size)`.
+        Image hidden states produced by the vision encoder, after projecting its last hidden state.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None  # every field of this output defaults to None
+
+
+# Copied from transformers.models.llava.modeling_llava.LlavaMultiModalProjector with Llava->LlavaNext
+class LlavaNextMultiModalProjector(nn.Module):  # linear -> activation -> linear projection of vision features
+    def __init__(self, config: LlavaNextConfig):
+        super().__init__()
+        # The input width is the vision hidden_size times the number of vision feature layers (1 for an int)
+        num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size * num_feature_layers,
+            config.text_config.hidden_size,
+            bias=config.multimodal_projector_bias,
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)  # vision feature width -> text hidden_size
+        hidden_states = self.act(hidden_states)  # activation looked up from `config.projector_hidden_act`
+        hidden_states = self.linear_2(hidden_states)  # text hidden_size -> text hidden_size
+        return hidden_states
+
+
+@auto_docstring
+class LlavaNextPreTrainedModel(PreTrainedModel):
+    config: LlavaNextConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        # Linear layers get normal(0, std) weights and a zero bias; a LlavaNextModel re-draws `image_newline`.
+        text_std = self.config.get_text_config().initializer_range  # always looked up, as the fallback value
+        std = getattr(self.config, "initializer_range", text_std)
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, LlavaNextModel):
+            module.image_newline.data.normal_(mean=0.0, std=1 / math.sqrt(self.config.text_config.hidden_size))
+
+
+@auto_docstring(
+    custom_intro="""
+    The Llava-Next model which consists of a vision backbone and a language model without language modeling head.
+    """
+)
+class LlavaNextModel(LlavaNextPreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+
+    def __init__(self, config: LlavaNextConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+        # Projector plus a learned "newline" vector of text hidden_size, drawn with std 1/sqrt(hidden_size).
+        self.multi_modal_projector = LlavaNextMultiModalProjector(config)
+        embed_std = 1 / math.sqrt(config.text_config.hidden_size)
+        self.image_newline = nn.Parameter(torch.randn(config.text_config.hidden_size, dtype=self.dtype) * embed_std)
+        # Text backbone; a missing `pad_token_id` is stored as -1.
+        self.vocab_size = config.text_config.vocab_size
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()  # delegated to the language model
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)  # delegated to the language model
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder  # replaces the language model as a whole
+
+    def get_decoder(self):
+        return self.language_model  # the "decoder" is the language model itself
+
+    def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
+        """
+        Reshape, unpad and then pack the features of each image into one tensor per image holding all its visual vectors.
+
+        Args:
+            image_features (`list[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
+                List of image feature tensor, each contains all the visual feature of all patches.
+            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
+                Actual image size of each images (H, W).
+            vision_feature_select_strategy (`str`)
+                The feature selection strategy; here it only decides whether the shape-mismatch warning applies.
+            image_newline (`torch.Tensor` of shape `(embed_dim)`)
+                New line embedding vector; when `None`, no newline embeddings are inserted.
+        Returns:
+            image_features (`list[torch.Tensor]`): one packed feature tensor per image (not concatenated here).
+            feature_lens (`torch.Tensor` of dtype `torch.long`)
+                token length of each image in image_features
+        """
+        new_image_features = []
+        feature_lens = []
+        for image_idx, image_feature in enumerate(image_features):
+            if image_feature.shape[0] > 1:
+                base_image_feature = image_feature[0]
+                image_feature = image_feature[1:]
+                height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
+                # Number of patches along the height and width of the resolution selected for this image.
+                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+                    image_sizes[image_idx],
+                    self.config.image_grid_pinpoints,
+                    self.config.vision_config.image_size,
+                )
+                # A size mismatch only triggers a (one-time) warning; the `view` below is attempted regardless.
+                if (
+                    np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
+                    and vision_feature_select_strategy == "default"
+                ):
+                    logger.warning_once(
+                        "Image feature shape does not line up with the provided patch size. "
+                        "You may be using the `default` vision_feature_select_strategy with a"
+                        " visual encoder that does not have CLS."
+                    )
+                # Lay the patches out on their grid, move the embedding dim first, then merge rows and columns.
+                image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
+                image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+                image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+                image_feature = unpad_image(image_feature, image_sizes[image_idx])
+                if image_newline is not None:
+                    image_feature = torch.cat(
+                        (
+                            image_feature,
+                            image_newline[:, None, None]
+                            .expand(*image_feature.shape[:-1], 1)
+                            .to(image_feature.device, image_feature.dtype),
+                        ),
+                        dim=-1,
+                    )
+                image_feature = image_feature.flatten(1, 2).transpose(0, 1)  # back to (tokens, embed_dim)
+                image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+            else:
+                image_feature = image_feature[0]
+                if image_newline is not None:
+                    image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
+            new_image_features.append(image_feature)
+            feature_lens.append(image_feature.size(0))
+        feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features[0].device)
+        return new_image_features, feature_lens
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        """
+        Run the vision tower on the image patches, project the selected hidden states and pack them per image.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, channels, height, width)`)
+               The tensors corresponding to the input images (a 4-D tensor of concatenated patches also works).
+            image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
+                Actual image size of each images (H, W).
+            vision_feature_layer (`Union[int, list[int]]`, *optional*):
+                The index of the layer to select the vision feature. If multiple indices are provided,
+                the vision feature of the corresponding indices will be concatenated to form the
+                vision features. Defaults to `config.vision_feature_layer`.
+            vision_feature_select_strategy (`str`, *optional*):
+                The feature selection strategy used to select the vision feature from the vision backbone.
+                Can be one of `"default"` or `"full"`. Defaults to `config.vision_feature_select_strategy`.
+        Returns:
+            image_features (list[`torch.Tensor`]): One packed feature tensor per image, as produced by
+            `pack_image_features`.
+
+        Raises:
+            ValueError: If `pixel_values` has neither 4 nor 5 dimensions.
+        """
+        if vision_feature_layer is None:
+            vision_feature_layer = self.config.vision_feature_layer
+        if vision_feature_select_strategy is None:
+            vision_feature_select_strategy = self.config.vision_feature_select_strategy
+
+        # Number of patches (base patch included) that each image contributes, derived from its size.
+        image_num_patches = []
+        for imsize in image_sizes:
+            num_patches = image_size_to_num_patches(
+                image_size=imsize,
+                grid_pinpoints=self.config.image_grid_pinpoints,
+                patch_size=self.config.vision_config.image_size,
+            )
+            image_num_patches.append(num_patches)
+
+        num_dims = pixel_values.dim()
+        if num_dims == 5:
+            # Stacked input (batch_size, num_patches, num_channels, height, width): keep only the first
+            # `num_patches` entries of each image (any extra, e.g. padded, patches are dropped) and concatenate.
+            kept = [patches[:count] for patches, count in zip(pixel_values, image_num_patches)]
+            pixel_values = torch.cat(kept, dim=0)
+        elif num_dims != 4:
+            # Anything else has to be an already concatenated (num_patches, num_channels, height, width) tensor.
+            raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
+
+        vision_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        # A single layer index selects one hidden state; several indices are concatenated on the last dim.
+        if isinstance(vision_feature_layer, int):
+            selected = vision_outputs.hidden_states[vision_feature_layer]
+        else:
+            selected = torch.cat([vision_outputs.hidden_states[idx] for idx in vision_feature_layer], dim=-1)
+
+        if vision_feature_select_strategy == "default":
+            selected = selected[:, 1:]  # drop the first (CLS) token of every patch sequence
+
+        projected = self.multi_modal_projector(selected)
+        per_image = torch.split(projected, image_num_patches, dim=0)
+
+        # NOTE we only support multimodal_patch_merge_type == "spatial_unpad"
+        packed_features, _ = self.pack_image_features(
+            per_image,
+            image_sizes,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            image_newline=self.image_newline,
+        )
+        return packed_features
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Locate the image placeholder positions, using `input_ids` when given and `inputs_embeds` otherwise.
+
+        The returned boolean mask is expanded to the shape of `inputs_embeds`. A `ValueError` is raised when the
+        number of masked embedding values differs from the number of values in `image_features`.
+        """
+        if input_ids is not None:
+            token_mask = input_ids == self.config.image_token_id
+        else:
+            token_tensor = torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            token_mask = (inputs_embeds == self.get_input_embeddings()(token_tensor)).all(-1)
+
+        n_image_tokens = token_mask.sum()
+        full_mask = token_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if inputs_embeds[full_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
+            )
+        return full_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, LlavaNextModelOutputWithPast]:
+        r"""
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
+            If `"full"`, the full vision features are used.
+        """
+        # Explicit arguments win over config defaults. `return_dict` is accepted but not read in this body.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if vision_feature_layer is None:
+            vision_feature_layer = self.config.vision_feature_layer
+        if vision_feature_select_strategy is None:
+            vision_feature_select_strategy = self.config.vision_feature_select_strategy
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        # Default to `None` so the return below is well-defined when `pixel_values` is given but empty
+        # (`size(0) == 0`): that case skips the image branch and used to leave `image_features` unbound.
+        image_features = None
+        if pixel_values is not None and pixel_values.size(0) > 0:
+            image_features = self.get_image_features(
+                pixel_values,
+                image_sizes,
+                vision_feature_layer=vision_feature_layer,
+                vision_feature_select_strategy=vision_feature_select_strategy,
+            )
+            image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            # Overwrite the embeddings at the image placeholder positions with the image features.
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return LlavaNextModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The LLAVA-NeXT model which consists of a vision backbone and a language model.
+    """
+)
+class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {  # regex on old key prefix -> new prefix (presumably used at load; confirm)
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+        "^image_newline": "model.image_newline",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: LlavaNextConfig):
+        super().__init__(config)
+        self.model = LlavaNextModel(config)  # holds the language model, vision tower and projector
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):  # delegates to the wrapped `LlavaNextModel`
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):  # delegates to the wrapped `LlavaNextModel`
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:  # the vocabulary projection owned by this class
+        return self.lm_head
+
+    def set_decoder(self, decoder):  # delegates to the wrapped `LlavaNextModel`
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):  # delegates to the wrapped `LlavaNextModel`
+        return self.model.get_decoder()
+
+    def pack_image_features(self, image_features, image_sizes, vision_feature_select_strategy, image_newline=None):
+        return self.model.pack_image_features(  # pure pass-through, all arguments forwarded by keyword
+            image_features=image_features,
+            image_sizes=image_sizes,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            image_newline=image_newline,
+        )
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        image_sizes: torch.Tensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+    ):
+        return self.model.get_image_features(  # pure pass-through, all arguments forwarded by keyword
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+        )
+
+    # Read-only aliases so `language_model` / `vision_tower` / `multi_modal_projector` stay reachable on this class (BC)
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, LlavaNextCausalLMOutputWithPast]:
+        r"""
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
+            If `"full"`, the full vision features are used.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, LlavaNextForConditionalGeneration
+
+        >>> model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+
+        >>> prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "[INST]  \nWhat is shown in this image? [/INST] The image appears to be a radar chart, which is a type of multi-dimensional plot (...)"
+        ```"""
+
+        # Explicit arguments win; anything left as `None` is read from the config.
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if vision_feature_layer is None:
+            vision_feature_layer = self.config.vision_feature_layer
+        if vision_feature_select_strategy is None:
+            vision_feature_select_strategy = self.config.vision_feature_select_strategy
+
+        outputs = self.model(
+            input_ids,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # An int keeps only the trailing `logits_to_keep` positions (0 keeps all of them, because
+        # `slice(-0, None)` is the full range); any other value is used directly as the sequence index.
+        if isinstance(logits_to_keep, int):
+            keep = slice(-logits_to_keep, None)
+        else:
+            keep = logits_to_keep
+        logits = self.lm_head(outputs[0][:, keep, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return LlavaNextCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        image_sizes=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overridden so that image inputs are attached only on the step where `cache_position` starts at 0.
+
+        generation_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # Later (cached) decoding steps no longer carry the special image token in their input ids, so
+        # `pixel_values` / `image_sizes` are only added to the inputs of the first step.
+        is_first_step = cache_position[0] == 0
+        if is_first_step:
+            generation_inputs["pixel_values"] = pixel_values
+            generation_inputs["image_sizes"] = image_sizes
+        return generation_inputs
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`. If the input `attention_mask` is already 4D, it is returned unchanged.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)  # zero the diagonal and everything below it
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0  # sum is 0 where causal entry is 0 and the 2D mask entry is 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+__all__ = ["LlavaNextForConditionalGeneration", "LlavaNextPreTrainedModel", "LlavaNextModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/processing_llava_next.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/processing_llava_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..2574fc443519f928a1d8f14bed08ae15466950c5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/processing_llava_next.py
@@ -0,0 +1,266 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for LLaVa-NeXT.
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils import select_best_resolution
+from ...image_utils import ImageInput, get_image_size, to_numpy_array
+from ...processing_utils import (
+    MultiModalData,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+)
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class LlavaNextProcessorKwargs(ProcessingKwargs, total=False):
+    # Processor-level default kwargs, keyed by the component they are forwarded to:
+    #   "text_kwargs"   -> padding off, `mm_token_type_ids` not returned
+    #   "images_kwargs" -> `do_pad` on
+    # NOTE(review): this dict is a class attribute, so it is shared by every user of the class; presumably
+    # it is only meant to be read (e.g. by `_merge_kwargs`) - copy it before applying per-call overrides.
+    _defaults = {
+        "text_kwargs": {"padding": False, "return_mm_token_type_ids": False},
+        "images_kwargs": {"do_pad": True},
+    }
+
+
+class LlavaNextProcessor(ProcessorMixin):
+    r"""
+    Constructs a LLaVa-NeXT processor which wraps a LLaVa-NeXT image processor and a LLaMa tokenizer into a single processor.
+
+    [`LlavaNextProcessor`] offers all the functionalities of [`LlavaNextImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~LlavaNextProcessor.__call__`] and [`~LlavaNextProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`LlavaNextImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        patch_size (`int`, *optional*):
+            Patch size from the vision tower.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Should be same as in model's config
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        image_token (`str`, *optional*, defaults to `"<image>"`):
+            Special token used to denote image location.
+        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
+            extra tokens appended, no need to set this arg.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        patch_size=None,
+        vision_feature_select_strategy=None,
+        chat_template=None,
+        image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
+        num_additional_image_tokens=0,
+        **kwargs,
+    ):
+        self.patch_size = patch_size
+        self.num_additional_image_tokens = num_additional_image_tokens
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        # Prefer the tokenizer's own image token / id when it defines them; otherwise fall back to the argument.
+        self.image_token = getattr(tokenizer, "image_token", image_token)
+        token_id = getattr(tokenizer, "image_token_id", None)
+        if not token_id:
+            token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+        self.image_token_id = token_id
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[LlavaNextProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least images or text.")
+
+        output_kwargs = self._merge_kwargs(
+            LlavaNextProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        if text is None:
+            # Image-only call: skip text handling instead of failing on `text[0]` in the type check below.
+            return BatchFeature(data={**image_inputs}, tensor_type=return_tensors)
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+
+        prompt_strings = text
+        if image_inputs:
+            image_sizes = iter(image_inputs["image_sizes"])
+            height, width = get_image_size(to_numpy_array(image_inputs["pixel_values"][0][0]))
+            prompt_strings = []
+            for sample in text:
+                while self.image_token in sample:
+                    image_size = next(image_sizes)
+                    if not isinstance(image_size, (list, tuple)):
+                        # cast to list to avoid numerical precision errors when calculating unpadding
+                        image_size = image_size.tolist()
+                    orig_height, orig_width = image_size
+                    num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
+                    if self.vision_feature_select_strategy == "default":
+                        num_image_tokens -= 1
+                    sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
+                prompt_strings.append(sample)
+            prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
+
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
+        """Total placeholder count for one image: unpadded grid features + newline features + base features."""
+        grid_pinpoints = self.image_processor.image_grid_pinpoints
+        best_height, best_width = select_best_resolution([orig_height, orig_width], grid_pinpoints)
+        # How many `height` x `width` tiles fit along each axis of the selected resolution.
+        tiles_high = best_height // height
+        tiles_wide = best_width // width
+
+        patch = self.patch_size
+        rows, cols = height // patch, width // patch
+        unpadded_features, newline_features = self._get_unpadded_features(
+            orig_height, orig_width, rows, cols, tiles_high, tiles_wide
+        )
+        # The base image contributes one feature per patch, plus any configured additional tokens (e.g. CLS).
+        base_features = rows * cols + self.num_additional_image_tokens
+
+        return unpadded_features + newline_features + base_features
+
+    def _get_unpadded_features(self, height, width, patches_height, patches_width, scale_height, scale_width):
+        """
+        Return `(unpadded_features, newline_features)` for an image of size `height` x `width` laid out on a grid of
+        `patches_height * scale_height` by `patches_width * scale_width` cells. The grid is shrunk along one axis
+        to follow the image's aspect ratio; the feature count is the remaining area, with one newline per row.
+        """
+        grid_h = patches_height * scale_height
+        grid_w = patches_width * scale_width
+
+        # Image relatively wider than the grid -> trim rows; otherwise trim columns. The amount removed is
+        # `2 * (difference // 2)`, i.e. the same whole number of cells is taken off both sides.
+        if width / height > grid_w / grid_h:
+            fitted_h = int(round(height * (grid_w / width), 7))
+            trim = (grid_h - fitted_h) // 2
+            grid_h -= trim * 2
+        else:
+            fitted_w = int(round(width * (grid_h / height), 7))
+            trim = (grid_w - fitted_w) // 2
+            grid_w -= trim * 2
+
+        # One newline feature is counted per remaining row.
+        unpadded_features = grid_h * grid_w
+        return (unpadded_features, grid_h)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (list[list[int]], *optional*):
+                The input sizes formatted as (height, width) per each image.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            # Merge into a fresh dict: `.update(kwargs)` on the dict returned by `_defaults.get(...)` mutated
+            # the shared class-level defaults in place, so per-call overrides (e.g. `size`) persisted.
+            images_kwargs = {**LlavaNextProcessorKwargs._defaults.get("images_kwargs", {}), **kwargs}
+
+            size = images_kwargs.get("size", None) or self.image_processor.size
+            # Both processed sides use the same value: `shortest_edge` if present, else min(height, width).
+            if "shortest_edge" in size:
+                processed_height = processed_width = size["shortest_edge"]
+            else:
+                processed_height = processed_width = min(size["height"], size["width"])
+
+            batch_num_image_tokens = []
+            num_image_patches = [1] * len(image_sizes)  # llava-next doesn't batch pixels as Idefics, thus `1` patch`
+            for image_size in image_sizes:
+                orig_height, orig_width = image_size
+                num_image_tokens = self._get_number_of_features(
+                    orig_height, orig_width, processed_height, processed_width
+                )
+                if self.vision_feature_select_strategy == "default":
+                    num_image_tokens -= 1
+                batch_num_image_tokens.append(num_image_tokens)
+            vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+
+__all__ = ["LlavaNextProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9dfbb212df4f3cc9528f7fb22d0b19a17b74efe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/configuration_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/configuration_llava_next_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72be050034b1eea21d2c90fab4440475c69a8789
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/configuration_llava_next_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/image_processing_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/image_processing_llava_next_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f39b198640566d3e91bd73b61d6eb95c896357e7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/image_processing_llava_next_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modeling_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modeling_llava_next_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..317404c2ed0b0dea6200e61404222aa84155ad15
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modeling_llava_next_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modular_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modular_llava_next_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d5b63be3535a8f880650dde9873ab26430f5c5f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/modular_llava_next_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/processing_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/processing_llava_next_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4db0b7f5edc60f499d146b5108d84e1907976d4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/processing_llava_next_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/video_processing_llava_next_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/video_processing_llava_next_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c45bdc7715293b882c1f381e4ec47a68449b8780
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next_video/__pycache__/video_processing_llava_next_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea6ab0cfff35097a62cf665da6c37742553083fa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py
@@ -0,0 +1,2227 @@
+# coding=utf-8
+# Copyright 2022 Google LLC., LongT5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch LongT5 model."""
+
+import copy
+import math
+import warnings
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    auto_docstring,
+    is_torch_flex_attn_available,
+    is_torch_fx_proxy,
+    is_torchdynamo_compiling,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_longt5 import LongT5Config
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+# TODO: Update before the merge
+
+
+def _pad_to_multiple(x: torch.Tensor, block_len: int, dim: int, pad_value: int = 0) -> torch.Tensor:
+    """Right-pad `x` along `dim` with `pad_value` so that its length there becomes a multiple of `block_len`."""
+    pad_len = -x.shape[dim] % block_len
+    # A tensor with a zero-sized dimension holds no elements, so it is rebuilt directly at the padded shape.
+    if not all(x.shape):
+        new_shape = list(x.shape)
+        new_shape[dim] += pad_len
+        # Allocate on the input's device; omitting `device` would create the result on the default device.
+        return torch.zeros(new_shape, dtype=x.dtype, device=x.device)
+    # `nn.functional.pad` expects flat (left, right) amounts ordered from the last dimension to the first.
+    pad = [(0, 0)] * x.ndim
+    pad[dim] = (0, pad_len)
+    pad = sum(pad[::-1], ())
+    return nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)
+
+
+def _split_into_blocks(x: torch.Tensor, block_len: int, dim: int) -> torch.Tensor:
+    """Reshape `x` so that dimension `dim` is replaced by two dimensions `(num_blocks, block_len)`.
+
+    When the length along `dim` is not divisible by `block_len`, the tensor is zero-padded at the end first.
+    """
+    if x.shape[dim] % block_len:
+        x = _pad_to_multiple(x, block_len, dim, pad_value=0)
+    leading, trailing = x.shape[:dim], x.shape[(dim + 1) :]
+    blocked_shape = leading + (x.shape[dim] // block_len, block_len) + trailing
+    # Reshaping a zero-element tensor is incompatible with ONNX conversion, so allocate the result instead
+    if 0 in blocked_shape:
+        return torch.empty(blocked_shape, dtype=x.dtype, device=x.device)
+    return x.reshape(blocked_shape)
+
+
+def _concatenate_3_blocks(x: torch.Tensor, block_dim: int, sequence_dim: int, pad_value: int = 0) -> torch.Tensor:
+    """Build, for every block, the concatenation of its left neighbour, itself and its right neighbour.
+
+    One block filled with `pad_value` is added on each side of `block_dim`, so the first and last blocks also get
+    two neighbours. This is the key/value layout of local attention, see https://huggingface.co/papers/2112.07916.
+    """
+    num_blocks = x.shape[block_dim]
+
+    # `nn.functional.pad` takes (before, after) amounts flattened from the last dimension to the first
+    pad_widths = [(0, 0)] * x.ndim
+    pad_widths[block_dim] = (1, 1)
+    flat_pad = sum(pad_widths[::-1], ())
+    # [batch_size, num_blocks, block_len] -> [batch_size, num_blocks + 2, block_len]
+    padded = nn.functional.pad(x, pad=flat_pad, mode="constant", value=pad_value)
+
+    def shifted_window(offset: int) -> torch.Tensor:
+        # Select `num_blocks` consecutive blocks starting at `offset`, leaving all other dimensions untouched
+        selector = [slice(0, None)] * padded.ndim
+        selector[block_dim] = slice(offset, offset + num_blocks)
+        return padded[tuple(selector)]
+
+    # [batch_size, num_blocks, 3 * block_len, ...]
+    return torch.cat([shifted_window(offset) for offset in range(3)], dim=sequence_dim)
+
+
+def _make_3block_relative_position_ids(block_len: int) -> torch.Tensor:
+    """Return the `[block_len, 3 * block_len]` offsets of every 3-block key slot relative to each center-block query."""
+    key_positions = torch.arange(3 * block_len, dtype=torch.int32)
+    # Queries live in the middle block, i.e. positions `block_len` .. `2 * block_len - 1`
+    query_positions = key_positions[block_len:-block_len]
+    # Broadcast a key row against a query column: entry [q, k] = k - (block_len + q)
+    return key_positions[None, :] - query_positions[:, None]
+
+
+def _mask_local_attention_mask(local_attention_mask: torch.Tensor, block_len: int) -> torch.Tensor:
+    """Zero out mask entries whose key lies `block_len` or more positions away from the query."""
+    offsets = _make_3block_relative_position_ids(block_len)
+    # [block_len, 3 * block_len] -> [1, 1, block_len, 3 * block_len] so it broadcasts over batch and blocks
+    within_radius = (offsets.abs() < block_len).unsqueeze(0).unsqueeze(0)
+    within_radius = within_radius.to(local_attention_mask.device)
+    return torch.logical_and(local_attention_mask, within_radius)
+
+
+def _get_local_attention_mask(attention_mask: torch.Tensor, block_len: int, device: torch.device) -> torch.Tensor:
+    """Turn a `[batch_size, seq_len]` padding mask into the blocked mask consumed by local attention."""
+    # Query side: [batch_size, num_blocks, block_len]
+    query_mask = _split_into_blocks(attention_mask, block_len, dim=1)
+    # Key side, each block widened with both neighbours: [batch_size, num_blocks, 3 * block_len]
+    key_mask = _concatenate_3_blocks(query_mask, block_dim=1, sequence_dim=2)
+
+    # A (query, key) pair is kept only if both entries are non-zero:
+    # [batch_size, num_blocks, block_len, 1] & [batch_size, num_blocks, 1, 3 * block_len]
+    pair_mask = torch.logical_and(query_mask.unsqueeze(-1), key_mask.unsqueeze(-2))
+    # Additionally drop pairs that are too far apart
+    pair_mask = _mask_local_attention_mask(pair_mask, block_len)
+    # Add a singleton dimension at index 1: [batch_size, 1, num_blocks, block_len, 3 * block_len]
+    return pair_mask.unsqueeze(1).to(device)
+
+
+def _make_global_fixed_block_ids(
+    attention_mask: torch.Tensor, global_block_size: int
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Obtain the "fixed block" global id corresponding to each input token.
+
+    This implementation is a simplified version of the original Flaxformer implementation adopted from:
+    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.
+
+    Orphan tokens, i.e. trailing tokens which do not fill a whole fixed block, are assigned to the preceding block.
+    Padding tokens (`attention_mask == 0`) are represented by -1. Returns `(global_block_ids, global_segment_ids)`,
+    both int32: per-token block ids `[batch_size, seq_len]` and a 0/1 mask over the `seq_len // global_block_size`
+    global slots that is 1 where the slot index does not exceed the largest block id of the sequence.
+    """
+    batch_size, seq_len = attention_mask.shape[:2]
+
+    def handle_orphan_tokens(block_ids: torch.Tensor) -> torch.Tensor:
+        block_ends = (torch.arange(seq_len) % global_block_size) == global_block_size - 1
+        block_ends = block_ends.to(block_ids.device)
+        true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
+        full_blocks = true_block_ends.sum(-1).unsqueeze(-1).type(block_ids.dtype) - 1
+        block_ids = torch.where(block_ids < full_blocks, block_ids, full_blocks)
+        return block_ids
+
+    fixed_block_mask = torch.ones_like(attention_mask, device=attention_mask.device) / global_block_size
+    fixed_block_mask = torch.cumsum(fixed_block_mask, axis=1) - fixed_block_mask
+    mask = torch.where(attention_mask != 0.0, 1.0, -1000.0).type(attention_mask.dtype)
+    global_block_ids = torch.floor(mask + fixed_block_mask - 1.0).type(attention_mask.dtype)
+    _global_block_ids_lower_bound = torch.tensor(-1, dtype=global_block_ids.dtype, device=global_block_ids.device)
+    global_block_ids = torch.where(
+        global_block_ids > _global_block_ids_lower_bound, global_block_ids, _global_block_ids_lower_bound
+    )
+    # attended tokens (mask == 1) keep their id; padding tokens (mask == 0) become 0 * id + (0 - 1) = -1
+    global_block_ids = (global_block_ids * attention_mask) + (attention_mask - 1)
+    # [batch_size, seq_len]; ids beyond the last complete non-padding block are capped to that block's id
+    global_block_ids = handle_orphan_tokens(global_block_ids)
+    num_globals = seq_len // global_block_size
+    # [batch_size, seq_len // global_block_size]: largest block id of each sequence, repeated per global slot
+    if num_globals > 0:
+        _sequence_block_ids_max = torch.max(global_block_ids, dim=-1).values.repeat(num_globals, 1).transpose(0, 1)
+    else:
+        _sequence_block_ids_max = torch.zeros(
+            batch_size, 0, dtype=global_block_ids.dtype, device=global_block_ids.device
+        )
+    global_segment_ids = torch.cumsum(torch.ones(batch_size, num_globals), dim=-1) - 1
+    global_segment_ids = global_segment_ids.to(attention_mask.device)
+    global_segment_ids = torch.where(global_segment_ids <= _sequence_block_ids_max, 1, 0)
+    return global_block_ids.type(torch.int), global_segment_ids.type(torch.int)
+
+
+def _make_side_relative_position_ids(attention_mask: torch.Tensor, global_block_size: int) -> torch.Tensor:
+    """Compute, for every token, the signed offset from its own global block id to each global block index."""
+    token_block_ids, segment_ids = _make_global_fixed_block_ids(attention_mask, global_block_size)
+    num_global_blocks = segment_ids.shape[-1]
+    global_block_positions = torch.arange(num_global_blocks, device=token_block_ids.device)
+    # [global_seq_len] - [batch_size, seq_len, 1] -> [batch_size, seq_len, global_seq_len]
+    return (global_block_positions - token_block_ids.unsqueeze(-1)).type(torch.int64)
+
+
+def _create_global_aggregates(
+    hidden_states: torch.Tensor, block_ids: torch.Tensor, global_seq_len: int
+) -> torch.Tensor:
+    """Sum the hidden states of all tokens sharing a block id, yielding one aggregate vector per global block."""
+    # Negative ids are redirected to an extra bucket `global_seq_len`, which is sliced off again below
+    overflow_bucket = torch.tensor(global_seq_len, dtype=block_ids.dtype, device=block_ids.device)
+    bucket_ids = torch.where(block_ids >= 0, block_ids, overflow_bucket).type(torch.int64)
+    # (batch..., seq_len, global_seq_len)
+    membership = nn.functional.one_hot(bucket_ids, global_seq_len + 1)[:, :, :-1].type(hidden_states.dtype)
+    return torch.einsum("...nd,...ng->...gd", hidden_states, membership)
+
+
+# Copied from transformers.models.t5.modeling_t5.T5LayerNorm with T5->LongT5
+class LongT5LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Scale-only layer normalization in the LongT5 style: no mean subtraction and no bias term.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # This is Root Mean Square Layer Normalization (https://huggingface.co/papers/1910.07467): the input is
+        # divided by the root of its mean square over the last dimension and multiplied by a learned weight.
+        # The mean square is always accumulated in fp32, even for half-precision inputs.
+
+        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        normalized = hidden_states * torch.rsqrt(mean_square + self.variance_epsilon)
+
+        # cast back down when the weight itself is stored in half precision
+        half_precision_weight = self.weight.dtype in [torch.float16, torch.bfloat16]
+        if half_precision_weight:
+            normalized = normalized.to(self.weight.dtype)
+
+        return self.weight * normalized
+
+
+try:
+    from apex.normalization import FusedRMSNorm
+    # apex is importable: rebind the module-level name so later references resolve to apex's FusedRMSNorm
+    LongT5LayerNorm = FusedRMSNorm
+
+    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNorm")
+except ImportError:
+    # apex is not installed: keep the pure PyTorch LongT5LayerNorm defined above
+    pass
+except Exception:
+    logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
+    pass
+
+
+# Copied from transformers.models.t5.modeling_t5.T5DenseActDense with T5->LongT5
+class LongT5DenseActDense(nn.Module):
+    """Position-wise feed-forward block: `wi` projection -> activation -> dropout -> `wo` projection (no biases)."""
+
+    def __init__(self, config: LongT5Config):
+        super().__init__()
+        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.act = ACT2FN[config.dense_act_fn]
+
+    def forward(self, hidden_states):
+        # d_model -> d_ff, then non-linearity and dropout
+        activated = self.dropout(self.act(self.wi(hidden_states)))
+        out_weight = self.wo.weight
+        # Align the activations with the output projection's dtype, except when that weight is int8
+        if isinstance(out_weight, torch.Tensor):
+            if activated.dtype != out_weight.dtype and out_weight.dtype != torch.int8:
+                activated = activated.to(out_weight.dtype)
+        # d_ff -> d_model
+        return self.wo(activated)
+
+
+class LongT5DenseGatedActDense(nn.Module):
+    """Gated feed-forward block: `act(wi_0(x)) * wi_1(x)` followed by dropout and the `wo` projection."""
+
+    def __init__(self, config: LongT5Config):
+        super().__init__()
+        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
+        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.act = ACT2FN[config.dense_act_fn]
+
+    def forward(self, hidden_states):
+        # The activated branch acts as a gate on the purely linear branch
+        gate = self.act(self.wi_0(hidden_states))
+        gated = gate * self.wi_1(hidden_states)
+        return self.wo(self.dropout(gated))
+
+
+# Copied from transformers.models.t5.modeling_t5.T5LayerFF with T5->LongT5
+class LongT5LayerFF(nn.Module):
+    """Pre-norm residual feed-forward sub-layer: `x + dropout(ff(layer_norm(x)))`."""
+
+    def __init__(self, config: LongT5Config):
+        super().__init__()
+        # `config.is_gated_act` selects the gated variant of the feed-forward block
+        ff_class = LongT5DenseGatedActDense if config.is_gated_act else LongT5DenseActDense
+        self.DenseReluDense = ff_class(config)
+
+        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(self, hidden_states):
+        # Normalize, transform, then add back onto the untouched input (residual connection)
+        residual_update = self.DenseReluDense(self.layer_norm(hidden_states))
+        return hidden_states + self.dropout(residual_update)
+
+
+# Copied from transformers.models.t5.modeling_t5.T5Attention with T5->LongT5
+class LongT5Attention(nn.Module):
+    def __init__(
+        self,
+        config: LongT5Config,
+        has_relative_attention_bias=False,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.relative_attention_max_distance = config.relative_attention_max_distance
+        self.d_model = config.d_model
+        self.key_value_proj_dim = config.d_kv
+        self.n_heads = config.num_heads
+        self.dropout = config.dropout_rate
+        self.inner_dim = self.n_heads * self.key_value_proj_dim
+        self.layer_idx = layer_idx
+        if layer_idx is None and self.is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()
+        self.gradient_checkpointing = False
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
+        )
+        # Prune the q/k/v projections and, along dim=1, the output projection
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.inner_dim = self.key_value_proj_dim * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention. The relative position is defined as
+        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
+        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
+        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
+        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
+        This should allow for more graceful generalization to longer sequences than the model has been trained on
+
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
+        """
+        relative_buckets = 0
+        if bidirectional:
+            num_buckets //= 2
+            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
+            relative_position = torch.abs(relative_position)
+        else:
+            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
+        # now relative_position is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = relative_position < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        relative_position_if_large = max_exact + (
+            torch.log(relative_position.float() / max_exact)
+            / math.log(max_distance / max_exact)
+            * (num_buckets - max_exact)
+        ).to(torch.long)
+        relative_position_if_large = torch.min(
+            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
+        )
+
+        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
+        return relative_buckets
+
+    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
+        """Compute binned relative position bias of shape (1, num_heads, query_length, key_length)"""
+        if device is None:
+            device = self.relative_attention_bias.weight.device
+        if cache_position is None:
+            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
+        else:
+            context_position = cache_position[:, None].to(device)
+        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
+        relative_position = memory_position - context_position  # shape (query_length, key_length)
+        relative_position_bucket = self._relative_position_bucket(
+            relative_position,  # shape (query_length, key_length)
+            bidirectional=(not self.is_decoder),
+            num_buckets=self.relative_attention_num_buckets,
+            max_distance=self.relative_attention_max_distance,
+        )
+        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
+        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
+        return values
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        mask=None,
+        key_value_states=None,
+        position_bias=None,
+        past_key_values=None,
+        layer_head_mask=None,
+        query_length=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        """Self-attention (if `key_value_states` is None) or cross-attention over `key_value_states`.
+        Returns `(attn_output, position_bias)`, followed by `attn_weights` when `output_attentions` is set.
+        """
+        # Input is (batch_size, seq_length, dim)
+        # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder)
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
+        is_cross_attention = key_value_states is not None
+
+        query_states = self.q(hidden_states)
+        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+        # Check if an encoder-decoder cache is being used. Otherwise we'll get `DynamicCache`
+        is_updated = False
+        if isinstance(past_key_values, EncoderDecoderCache):
+            is_updated = past_key_values.is_updated.get(self.layer_idx)
+            if is_cross_attention:
+                # after the first generated id, we can subsequently re-use all key/value_states from cache
+                curr_past_key_value = past_key_values.cross_attention_cache
+            else:
+                curr_past_key_value = past_key_values.self_attention_cache
+        else:
+            curr_past_key_value = past_key_values
+        # keys/values are projected from `key_value_states` for cross-attention, else from `hidden_states`
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k(current_states)
+            value_states = self.v(current_states)
+            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states), compatible with onnx op>9
+        scores = torch.matmul(query_states, key_states.transpose(3, 2))
+
+        if position_bias is None:
+            key_length = key_states.shape[-2]
+            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
+            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
+                )
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(
+                    real_seq_length, key_length, device=scores.device, cache_position=cache_position
+                )
+                position_bias = position_bias[:, :, -seq_length:, :]
+
+            if mask is not None:
+                causal_mask = mask[:, :, :, : key_states.shape[-2]]
+                position_bias = position_bias + causal_mask
+        # when heads were pruned, keep only the bias entries along dim 1 whose index is not in `pruned_heads`
+        if self.pruned_heads:
+            mask = torch.ones(position_bias.shape[1])
+            mask[list(self.pruned_heads)] = 0
+            position_bias_masked = position_bias[:, mask.bool()]
+        else:
+            position_bias_masked = position_bias
+
+        scores += position_bias_masked
+
+        # (batch_size, n_heads, seq_length, key_length)
+        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = attn_weights * layer_head_mask
+
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
+        attn_output = self.o(attn_output)
+
+        outputs = (attn_output, position_bias)
+
+        if output_attentions:
+            outputs = outputs + (attn_weights,)
+        return outputs
+
+
+class LongT5LocalAttention(nn.Module):
+    def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.relative_attention_max_distance = config.relative_attention_max_distance
+        self.d_model = config.d_model
+        self.key_value_proj_dim = config.d_kv
+        self.n_heads = config.num_heads
+        self.local_radius = config.local_radius
+        self.block_len = self.local_radius + 1
+        self.dropout = config.dropout_rate
+        self.inner_dim = self.n_heads * self.key_value_proj_dim
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()
+        self.gradient_checkpointing = False
+
+    # Copied from transformers.models.t5.modeling_t5.T5Attention.prune_heads
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
+        )
+        # Prune linear layers
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.inner_dim = self.key_value_proj_dim * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    # Copied from transformers.models.t5.modeling_t5.T5Attention._relative_position_bucket
+    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention. The relative position is defined as
+        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
+        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
+        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
+        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
+        This should allow for more graceful generalization to longer sequences than the model has been trained on
+
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
+        """
+        relative_buckets = 0
+        if bidirectional:
+            num_buckets //= 2
+            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
+            relative_position = torch.abs(relative_position)
+        else:
+            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
+        # now relative_position is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = relative_position < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        relative_position_if_large = max_exact + (
+            torch.log(relative_position.float() / max_exact)
+            / math.log(max_distance / max_exact)
+            * (num_buckets - max_exact)
+        ).to(torch.long)
+        relative_position_if_large = torch.min(
+            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
+        )
+
+        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
+        return relative_buckets
+
+    def compute_bias(self, block_length: int):
+        """Compute binned relative position bias"""
+        target_device = (
+            self.relative_attention_bias.weight.device
+            if self.relative_attention_bias.weight.device.type != "meta"
+            else None
+        )
+        memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
+        context_position = memory_position[block_length:-block_length]
+
+        # (block_length, 3 * block_length)
+        relative_position = memory_position[None, :] - context_position[:, None]
+        relative_position_bucket = self._relative_position_bucket(
+            relative_position,  # (block_length, 3 * block_length)
+            bidirectional=(not self.is_decoder),
+            num_buckets=self.relative_attention_num_buckets,
+            max_distance=self.relative_attention_max_distance,
+        )
+        # (block_length, 3 * block_length, num_heads)
+        values = self.relative_attention_bias(relative_position_bucket)
+        # (1, 1, num_heads, block_length, 3 * block_length)
+        values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
+        return values
+
+    def forward(
+        self,
+        hidden_states,
+        mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        output_attentions=False,
+    ):
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        def shape(states):
+            """projection"""
+            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim)
+
+        def unshape(states):
+            """reshape"""
+            return states.contiguous().view(batch_size, -1, self.inner_dim)
+
+        # get query/key/value states -> (batch_size, seq_length, n_heads, dim_per_head)
+        query_states = shape(self.q(hidden_states))
+        key_states = shape(self.k(hidden_states))
+        value_states = shape(self.v(hidden_states))
+
+        # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head)
+        query_states = _split_into_blocks(query_states, self.block_len, dim=1)
+        key_states = _split_into_blocks(key_states, self.block_len, dim=1)
+        value_states = _split_into_blocks(value_states, self.block_len, dim=1)
+
+        # Concatenate 3 blocks for keys and values -> (batch_size, num_blocks, 3 * block_len, n_heads, dim_per_head)
+        key_states = _concatenate_3_blocks(key_states, block_dim=1, sequence_dim=2)
+        value_states = _concatenate_3_blocks(value_states, block_dim=1, sequence_dim=2)
+
+        # Compute scores
+        scores = torch.einsum(
+            "...qhd,...khd->...hqk", query_states, key_states
+        )  # (batch_size, num_block, n_heads, block_len, 3 * block_len)
+
+        if position_bias is None:
+            # position_bias shape: # (1, 1, n_heads, block_len, 3 * block_len)
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, 1, self.n_heads, self.block_len, 3 * self.block_len), device=scores.device, dtype=scores.dtype
+                )
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(self.block_len)
+
+            if mask is not None:
+                # Replace masked positions with -1e10 (according to the original implementation)
+                mask = torch.where(mask > 0, 0.0, -1e10)
+                # We need to adjust position bias shape to be sum with mask
+                position_bias = position_bias + mask.transpose(1, 2)
+
+        scores += position_bias
+        # (batch_size, num_blocks, n_heads, block_len, 3 * block_len)
+        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
+        # (batch_size, num_blocks, n_heads, block_len, 3 * block_len)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = attn_weights * layer_head_mask
+        attn_weights = attn_weights.type(value_states.dtype)
+        attn_output = unshape(torch.einsum("...hqk,...khd->...qhd", attn_weights, value_states))
+        attn_output = attn_output[:, :seq_length, :]
+        attn_output = self.o(attn_output)
+
+        outputs = (
+            attn_output,
+            position_bias,
+        )
+
+        if output_attentions:
+            outputs = outputs + (attn_weights,)
+        return outputs
+
+
+class LongT5TransientGlobalAttention(nn.Module):
+    def __init__(self, config: LongT5Config, has_relative_attention_bias: bool = False) -> None:
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.relative_attention_max_distance = config.relative_attention_max_distance
+        self.d_model = config.d_model
+        self.key_value_proj_dim = config.d_kv
+        self.n_heads = config.num_heads
+        self.local_radius = config.local_radius
+        self.block_len = self.local_radius + 1
+        self.global_block_size = config.global_block_size
+        self.dropout = config.dropout_rate
+        self.inner_dim = self.n_heads * self.key_value_proj_dim
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()
+
+        # Relativen attention bias & Layer norm for global attention
+        if self.has_relative_attention_bias:
+            self.global_relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        self.global_input_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+
+    # Copied from transformers.models.t5.modeling_t5.T5Attention.prune_heads
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
+        )
+        # Prune linear layers
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.inner_dim = self.key_value_proj_dim * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    # Copied from transformers.models.t5.modeling_t5.T5Attention._relative_position_bucket
+    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention. The relative position is defined as
+        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
+        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
+        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
+        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
+        This should allow for more graceful generalization to longer sequences than the model has been trained on
+
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
+        """
+        relative_buckets = 0
+        if bidirectional:
+            num_buckets //= 2
+            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
+            relative_position = torch.abs(relative_position)
+        else:
+            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
+        # now relative_position is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = relative_position < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        relative_position_if_large = max_exact + (
+            torch.log(relative_position.float() / max_exact)
+            / math.log(max_distance / max_exact)
+            * (num_buckets - max_exact)
+        ).to(torch.long)
+        relative_position_if_large = torch.min(
+            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
+        )
+
+        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
+        return relative_buckets
+
+    def compute_bias(self, block_length: int):
+        """Compute binned relative position bias"""
+        target_device = (
+            self.relative_attention_bias.weight.device
+            if self.relative_attention_bias.weight.device.type != "meta"
+            else None
+        )
+        memory_position = torch.arange(3 * block_length, dtype=torch.long, device=target_device)
+        context_position = memory_position[block_length:-block_length]
+
+        # (block_length, 3 * block_length)
+        relative_position = memory_position[None, :] - context_position[:, None]
+        relative_position_bucket = self._relative_position_bucket(
+            relative_position,  # (block_length, 3 * block_length)
+            bidirectional=(not self.is_decoder),
+            num_buckets=self.relative_attention_num_buckets,
+            max_distance=self.relative_attention_max_distance,
+        )
+        # (block_length, 3 * block_length, num_heads)
+        values = self.relative_attention_bias(relative_position_bucket)
+        # (1, 1, num_heads, block_length, 3 * block_length)
+        values = values.permute([2, 0, 1]).unsqueeze(0).unsqueeze(0)
+        return values
+
+    def compute_side_bias(self, mask: torch.Tensor, global_segment_ids: torch.Tensor) -> torch.Tensor:
+        # (batch_size, 1, seq_len, global_seq_len)
+        side_attention_mask = torch.eq(mask[..., None], global_segment_ids[:, None, :])[:, None, ...]
+        attention_side_bias = torch.where(side_attention_mask > 0, 0.0, -1e10)
+        # (batch_size, seq_len, global_seq_len)
+        side_relative_position = _make_side_relative_position_ids(mask, self.global_block_size)
+        side_relative_position_bucket = self._relative_position_bucket(
+            side_relative_position,
+            bidirectional=(not self.is_decoder),
+            num_buckets=self.relative_attention_num_buckets,
+            max_distance=self.relative_attention_max_distance,
+        )
+        # (batch_size, seq_len, global_seq_len, num_heads)
+        side_bias = self.global_relative_attention_bias(side_relative_position_bucket)
+
+        # (batch_size, num_heads, seq_len, global_seq_len)
+        side_bias = side_bias.permute([0, 3, 1, 2])
+        # (batch_size, num_heads, seq_len, global_seq_len)
+        attention_side_bias = attention_side_bias + side_bias
+        return attention_side_bias
+
+    def forward(
+        self,
+        hidden_states,
+        mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        output_attentions=False,
+    ):
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        def shape(states):
+            """projection"""
+            return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim)
+
+        def unshape(states):
+            """reshape"""
+            return states.contiguous().view(batch_size, -1, self.inner_dim)
+
+        # Prepare components for transient-global attention
+        # Obtain block_ids and global_segment_ids
+        # global_seq_len := seq_len // self.global_block_size
+        # shapes: (batch_size, seq_len) & (batch_size, global_seq_len)
+        block_ids, global_segment_ids = _make_global_fixed_block_ids(
+            mask if mask is not None else torch.ones(hidden_states.shape[:-1]),
+            self.global_block_size,
+        )
+        # Create global inputs
+        _global_seq_len = global_segment_ids.shape[-1]
+        global_inputs = _create_global_aggregates(hidden_states, block_ids, _global_seq_len)
+        global_inputs = self.global_input_layer_norm(global_inputs)
+
+        # get query states -> (batch_size, seq_length, n_heads, dim_per_head)
+        query_states = shape(self.q(hidden_states))
+        key_states = shape(self.k(hidden_states))
+        value_states = shape(self.v(hidden_states))
+        # Get global/side key/value states  shape: (batch_size, global_seq_len, n_heads, dim_per_head)
+        side_key_states = shape(self.k(global_inputs))
+        side_value_states = shape(self.v(global_inputs))
+
+        # Split into blocks -> (batch_size, num_blocks, block_len, n_heads, dim_per_head)
+        query_states = _split_into_blocks(query_states, self.block_len, dim=1)
+        key_states = _split_into_blocks(key_states, self.block_len, dim=1)
+        value_states = _split_into_blocks(value_states, self.block_len, dim=1)
+
+        # Concatenate 3 blocks for keys and values -> (batch_size, num_blocks, 3 * block_len, n_heads, dim_per_head)
+        key_states = _concatenate_3_blocks(key_states, block_dim=1, sequence_dim=2)
+        value_states = _concatenate_3_blocks(value_states, block_dim=1, sequence_dim=2)
+
+        # Tile side inputs across local key/value blocks
+        # New shape: (batch_size, num_blocks, global_seq_len, n_heads, dim_per_head)
+        reps = [1] * (side_key_states.ndim + 1)
+        reps[1] = key_states.shape[1]
+        side_key_states = side_key_states.unsqueeze(1).repeat(reps)
+        side_value_states = side_value_states.unsqueeze(1).repeat(reps)
+
+        # Concatenate "local" and "side"/"global" key/value states to allow each token to attend global aggregated ones
+        # New shape: (batch_size, num_blocks, 3 * block_len + global_seq_len, n_heads, dim_per_head)
+        key_states = torch.cat([key_states, side_key_states], dim=2)
+        value_states = torch.cat([value_states, side_value_states], dim=2)
+
+        # Compute scores -> (batch_size, num_block, n_heads, block_len, 3 * block_len + global_seq_len)
+        scores = torch.einsum("...qhd,...khd->...hqk", query_states, key_states)
+
+        if mask is not None:
+            # We need to adjust position bias shape to be sum with mask
+            local_attention_mask = _get_local_attention_mask(mask, self.block_len, hidden_states.device)
+            # Replace masked positions with -10_000 (according to the original implementation)
+            local_attention_mask = torch.where(local_attention_mask > 0, 0.0, -1e10)
+        else:
+            local_attention_mask = None
+
+        if position_bias is None:
+            # position_bias shape: # (1, 1, n_heads, block_len, 3 * block_len)
+            if not self.has_relative_attention_bias:
+                position_bias = torch.zeros(
+                    (1, 1, self.n_heads, self.block_len, 3 * self.block_len),
+                    device=scores.device,
+                    dtype=scores.dtype,
+                )
+                if self.gradient_checkpointing and self.training:
+                    position_bias.requires_grad = True
+            else:
+                position_bias = self.compute_bias(self.block_len)
+
+            if local_attention_mask is not None:
+                # (batch_size, 1, n_heads, block_len, 3 * block_len)
+                position_bias = position_bias + local_attention_mask.transpose(1, 2)
+            position_bias = position_bias.type(scores.dtype)
+
+            # Calculate global/side bias - shape: # (batch_size, num_heads, seq_len, global_seq_len)
+            if mask is None:
+                mask = torch.ones(batch_size, seq_length)
+            # (batch_size, num_heads, seq_len, global_seq_len)
+            side_position_bias = self.compute_side_bias(mask, global_segment_ids)
+            # (batch_size, num_blocks, num_heads, block_len, global_seq_len)
+            side_position_bias = _split_into_blocks(side_position_bias, self.block_len, dim=-2).transpose(1, 2)
+            side_position_bias = side_position_bias.type(scores.dtype).to(scores.device)
+            # (batch_size, num_blocks, num_heads, block_len, 3 * block_len + global_seq_len)
+            position_bias = torch.cat([position_bias, side_position_bias], dim=-1)
+
+        scores += position_bias
+        # (batch_size, num_blocks, n_heads, block_len, 3 * block_len + global_seq_len)
+        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = attn_weights * layer_head_mask
+        attn_weights = attn_weights.type(value_states.dtype)
+        attn_output = unshape(torch.einsum("...hqk,...khd->...qhd", attn_weights, value_states))
+        attn_output = attn_output[:, :seq_length, :]
+        attn_output = self.o(attn_output)
+
+        outputs = (attn_output, position_bias)
+
+        if output_attentions:
+            outputs = outputs + (attn_weights,)
+        return outputs
+
+
+# Copied from transformers.models.t5.modeling_t5.T5LayerSelfAttention with T5->LongT5
+class LongT5LayerSelfAttention(nn.Module):
+    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.SelfAttention = LongT5Attention(
+            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
+        )
+        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        past_key_values=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.SelfAttention(
+            normed_hidden_states,
+            mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = hidden_states + self.dropout(attention_output[0])
+        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class LongT5LayerLocalSelfAttention(nn.Module):
+    """Local self attention used in encoder"""
+
+    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.LocalSelfAttention = LongT5LocalAttention(config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        output_attentions=False,
+        **kwargs: Any,  # to accept past_key_values and use_cache kwargs
+    ):
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.LocalSelfAttention(
+            normed_hidden_states,
+            mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + self.dropout(attention_output[0])
+        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class LongT5LayerTransientGlobalSelfAttention(nn.Module):
+    """Transient-Global self attention used in encoder"""
+
+    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.TransientGlobalSelfAttention = LongT5TransientGlobalAttention(
+            config, has_relative_attention_bias=has_relative_attention_bias
+        )
+        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        output_attentions=False,
+        **kwargs: Any,  # to accept past_key_values and use_cache kwargs
+    ):
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.TransientGlobalSelfAttention(
+            normed_hidden_states,
+            mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + self.dropout(attention_output[0])
+        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.t5.modeling_t5.T5LayerCrossAttention with T5->LongT5
+class LongT5LayerCrossAttention(nn.Module):
+    def __init__(self, config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.EncDecAttention = LongT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
+        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        key_value_states,
+        attention_mask=None,
+        position_bias=None,
+        layer_head_mask=None,
+        past_key_values=None,
+        use_cache=False,
+        query_length=None,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        normed_hidden_states = self.layer_norm(hidden_states)
+        attention_output = self.EncDecAttention(
+            normed_hidden_states,
+            mask=attention_mask,
+            key_value_states=key_value_states,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            query_length=query_length,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        layer_output = hidden_states + self.dropout(attention_output[0])
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class LongT5Block(GradientCheckpointingLayer):
+    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.is_decoder = config.is_decoder
+        if config.is_decoder:
+            attention_layer = LongT5LayerSelfAttention
+        elif config.encoder_attention_type == "local":
+            attention_layer = LongT5LayerLocalSelfAttention
+        elif config.encoder_attention_type == "transient-global":
+            attention_layer = LongT5LayerTransientGlobalSelfAttention
+        else:
+            raise ValueError(
+                "For encoder attention mechanism, either `local` or `transient-global` attention type is expected, "
+                f"but got {config.encoder_attention_type}."
+            )
+        self.layer = nn.ModuleList()
+        self.layer.append(
+            attention_layer(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx)
+        )
+        if self.is_decoder:
+            self.layer.append(LongT5LayerCrossAttention(config, layer_idx=layer_idx))
+
+        self.layer.append(LongT5LayerFF(config))
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_bias=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        encoder_decoder_position_bias=None,
+        layer_head_mask=None,
+        cross_attn_layer_head_mask=None,
+        past_key_values=None,
+        use_cache=False,
+        output_attentions=False,
+        return_dict=True,
+        cache_position=None,
+    ):
+        self_attention_outputs = self.layer[0](
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            layer_head_mask=layer_head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = self_attention_outputs[0]
+        attention_outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
+
+        # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/
+        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
+        if do_cross_attention:
+            cross_attention_outputs = self.layer[1](
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                position_bias=encoder_decoder_position_bias,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=past_key_values,
+                query_length=cache_position[-1] + 1,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            hidden_states = cross_attention_outputs[0]
+
+            # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+            # Keep cross-attention outputs and relative position weights
+            attention_outputs = attention_outputs + cross_attention_outputs[1:]
+
+        # Apply Feed Forward layer
+        hidden_states = self.layer[-1](hidden_states)
+
+        # clamp inf values to enable fp16 inference - check https://github.com/huggingface/transformers/pull/19229/
+        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        return (
+            (hidden_states,) + attention_outputs
+        )  # hidden-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
+
+
+@auto_docstring
+class LongT5PreTrainedModel(PreTrainedModel):
+    config: LongT5Config
+    base_model_prefix = "transformer"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LongT5Block"]
+
+    _can_compile_fullgraph = False  # TODO: @raushan more involved due to local/global attn
+
+    @property
+    # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel.dummy_inputs
+    def dummy_inputs(self):
+        input_ids = torch.tensor(DUMMY_INPUTS)
+        input_mask = torch.tensor(DUMMY_MASK)
+        dummy_inputs = {
+            "decoder_input_ids": input_ids,
+            "input_ids": input_ids,
+            "decoder_attention_mask": input_mask,
+        }
+        return dummy_inputs
+
+    def _try_load_missing_tied_module(self, key):
+        module = self
+        key = key.removesuffix(".weight")
+        for sub_key in key.split("."):
+            if not hasattr(module, sub_key):
+                return
+            module = getattr(module, sub_key)
+
+        self._tie_or_clone_weights(module, self.shared)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requested_loading_info = kwargs.get("output_loading_info", False)
+        kwargs["output_loading_info"] = True
+        model, loading_info = super().from_pretrained(*args, **kwargs)
+        missing_keys = loading_info.get("missing_keys", [])
+
+        if hasattr(model, "shared") and hasattr(model, "_tied_weights_keys"):
+            for missing_key in missing_keys:
+                logger.warning(
+                    f"Recovering a missing tied weight {missing_key} from a legacy LongT5 checkpoint. "
+                    f"Consider saving {missing_key} in your checkpoint or updating the config (tie_word_embeddings=true)."
+                )
+                model._try_load_missing_tied_module(missing_key)
+
+        if requested_loading_info:
+            return model, loading_info
+        return model
+
+    def _init_weights(self, module):
+        """Initialize `module` in place according to its type; every scale is multiplied by `initializer_factor`."""
+        cfg = self.config
+        factor = cfg.initializer_factor  # Used for testing weights initialization
+
+        def draw(weight, std):
+            weight.data.normal_(mean=0.0, std=std)
+
+        def clear_bias(linear):
+            if getattr(linear, "bias", None) is not None:
+                linear.bias.data.zero_()
+
+        if isinstance(module, LongT5LayerNorm):
+            module.weight.data.fill_(factor * 1.0)
+            return
+        if isinstance(module, (LongT5Model, LongT5ForConditionalGeneration, LongT5EncoderModel)):
+            # Mesh TensorFlow embeddings initialization
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            draw(module.shared.weight, factor * 1.0)
+            if hasattr(module, "lm_head") and not cfg.tie_word_embeddings:
+                draw(module.lm_head.weight, factor * 1.0)
+            return
+        if isinstance(module, LongT5DenseActDense):
+            # Mesh TensorFlow FF initialization
+            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+            for proj, fan_in in ((module.wi, cfg.d_model), (module.wo, cfg.d_ff)):
+                draw(proj.weight, factor * (fan_in**-0.5))
+                clear_bias(proj)
+            return
+        if isinstance(module, LongT5DenseGatedActDense):
+            for proj, fan_in in ((module.wi_0, cfg.d_model), (module.wi_1, cfg.d_model), (module.wo, cfg.d_ff)):
+                draw(proj.weight, factor * (fan_in**-0.5))
+                clear_bias(proj)
+            return
+        if isinstance(module, (LongT5Attention, LongT5LocalAttention, LongT5TransientGlobalAttention)):
+            # Mesh TensorFlow attention initialization to avoid scaling before softmax
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+            d_model, d_kv, n_heads = cfg.d_model, cfg.d_kv, cfg.num_heads
+            draw(module.q.weight, factor * ((d_model * d_kv) ** -0.5))
+            draw(module.k.weight, factor * (d_model**-0.5))
+            draw(module.v.weight, factor * (d_model**-0.5))
+            draw(module.o.weight, factor * ((n_heads * d_kv) ** -0.5))
+            if module.has_relative_attention_bias:
+                draw(module.relative_attention_bias.weight, factor * (d_model**-0.5))
+                if isinstance(module, LongT5TransientGlobalAttention):
+                    draw(module.global_relative_attention_bias.weight, factor * (d_model**-0.5))
+
+    # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._shift_right with T5->LongT5
+    def _shift_right(self, input_ids):
+        """Build decoder inputs from `input_ids`: prepend the start token, drop the last one, map -100 to pad."""
+        start_id = self.config.decoder_start_token_id
+        pad_id = self.config.pad_token_id
+        if start_id is None:
+            raise ValueError(
+                "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. "
+                "See LongT5 docs for more information."
+            )
+
+        if not is_torch_fx_proxy(input_ids):
+            # Regular tensors: allocate the result, then copy every token one position to the right.
+            shifted = input_ids.new_zeros(input_ids.shape)
+            shifted[..., 1:] = input_ids[..., :-1].clone()
+            shifted[..., 0] = start_id
+        else:
+            # Item assignment is not supported natively for proxies, so concatenate instead.
+            start_column = torch.full(input_ids.shape[:-1] + (1,), start_id)
+            shifted = torch.cat([start_column, input_ids[..., :-1]], dim=-1)
+
+        if pad_id is None:
+            raise ValueError("self.model.config.pad_token_id has to be defined.")
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted.masked_fill_(shifted == -100, pad_id)
+
+        return shifted
+
+
+class LongT5Stack(LongT5PreTrainedModel):
+    """Stack of `LongT5Block` layers; runs as a decoder when `config.is_decoder` is set, otherwise as an encoder."""
+
+    def __init__(self, config, embed_tokens=None):
+        super().__init__(config)
+        # A new embedding is always created; when `embed_tokens` is passed, its weight tensor is shared with it.
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
+        if embed_tokens is not None:
+            self.embed_tokens.weight = embed_tokens.weight
+        self.is_decoder = config.is_decoder
+        self.local_radius = config.local_radius
+        self.block_len = self.local_radius + 1
+        # Only the first block (i == 0) is built with has_relative_attention_bias=True.
+        self.block = nn.ModuleList(
+            [
+                LongT5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i)
+                for i in range(config.num_layers)
+            ]
+        )
+        self.final_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.t5.modeling_t5.T5Stack.set_input_embeddings
+    def set_input_embeddings(self, new_embeddings):
+        """Replace the embedding module that `forward` applies to `input_ids`."""
+        self.embed_tokens = new_embeddings
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        inputs_embeds=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position=None,
+    ):
+        """
+        Embed `input_ids` (unless `inputs_embeds` is given), prepare the attention masks and run every block in turn.
+        Returns a `BaseModelOutputWithPastAndCrossAttentions`, or - when `return_dict` is false - a tuple of the
+        non-None values among (hidden states, cache, all hidden states, attentions, cross-attentions).
+        """
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(
+                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            err_msg_prefix = "decoder_" if self.is_decoder else ""
+            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if inputs_embeds is None:
+            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        batch_size, seq_length = input_shape
+
+        if self.is_decoder:
+            if use_cache and past_key_values is None:
+                if self.config.is_encoder_decoder:
+                    past_key_values = EncoderDecoderCache(
+                        DynamicCache(config=self.config), DynamicCache(config=self.config)
+                    )
+                else:
+                    past_key_values = DynamicCache(config=self.config)
+        elif not self.is_decoder:
+            # do not pass cache object down the line for encoder stack
+            # it messes indexing later in decoder-stack because cache object is modified in-place
+            past_key_values = None
+
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
+            )
+
+        if attention_mask is None and not is_torchdynamo_compiling():
+            # required mask seq length can be calculated via length of past
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+
+        if self.is_decoder:
+            causal_mask = self._update_causal_mask(
+                attention_mask,
+                inputs_embeds,
+                cache_position,
+                past_key_values.self_attention_cache
+                if isinstance(past_key_values, EncoderDecoderCache)
+                else past_key_values,
+                output_attentions,
+            )
+        # We use local attention in encoder self-attention, otherwise standard self & cross attentions are used
+        elif self.config.encoder_attention_type == "local":
+            causal_mask = _get_local_attention_mask(attention_mask, self.block_len, inputs_embeds.device)
+        else:  # we need to use both local attention mask and standard extended mask for transient-global attention
+            causal_mask = attention_mask
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
+        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
+        position_bias = None
+        encoder_decoder_position_bias = None
+
+        hidden_states = self.dropout(inputs_embeds)
+
+        for i, layer_module in enumerate(self.block):
+            layer_head_mask = head_mask[i]
+            cross_attn_layer_head_mask = cross_attn_head_mask[i]
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(
+                hidden_states,
+                causal_mask,
+                position_bias,
+                encoder_hidden_states,
+                encoder_extended_attention_mask,
+                encoder_decoder_position_bias,  # as a positional argument for gradient checkpointing
+                layer_head_mask=layer_head_mask,
+                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                return_dict=return_dict,
+                cache_position=cache_position,
+            )
+
+            # Positions read below: [0] hidden states and [1] the self-attention position bias. With output_attentions,
+            # [2] is the self-attention weights, [3] the cross-attention position bias (else read from [2]), [4] cross weights.
+            hidden_states = layer_outputs[0]
+            # We share the position biases between the layers - the first layer stores them
+            position_bias = layer_outputs[1]
+            if self.is_decoder and encoder_hidden_states is not None:
+                encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[2],)
+                if self.is_decoder:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[4],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        # Add the output of the last layer (after the final layer norm and dropout)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        """Choose the decoder self-attention mask for `config._attn_implementation`: `None`, the mask as given, a
+        flex block mask, or the 4D mask built by `_prepare_4d_causal_attention_mask_with_cache_position`."""
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, returns it unchanged.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`. May also be `None`, in which case no padding is applied.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask; masked positions are filled with this dtype's minimum value.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+# FutureWarning text: head_mask was split into head_mask and decoder_head_mask. NOTE: `__` names are mangled in classes.
+__HEAD_MASK_WARNING_MSG = """
+The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
+`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
+If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
+num_heads)`.
+"""
+
+
+@auto_docstring
+class LongT5Model(LongT5PreTrainedModel):
+    # Bare encoder-decoder model: two `LongT5Stack`s built around one shared token-embedding module (`self.shared`).
+    _keys_to_ignore_on_load_unexpected = [r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
+    def __init__(self, config: LongT5Config):
+        super().__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+        # Each stack receives its own deep copy of `config`, so the overrides below never touch `config` itself.
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_decoder = False
+        encoder_config.use_cache = False
+        encoder_config.tie_encoder_decoder = False
+        self.encoder = LongT5Stack(encoder_config, self.shared)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.tie_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = LongT5Stack(decoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+        self.decoder.set_input_embeddings(new_embeddings)
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
+
+    def get_encoder(self):
+        return self.encoder
+
+    def _prune_heads(self, heads_to_prune):
+        """Prunes encoder self-attention heads. heads_to_prune: dict of {layer_num: list of heads to prune}."""
+        for layer, heads in heads_to_prune.items():
+            # `LongT5Stack` stores its layers in `block`; it defines no `layer` attribute.
+            # NOTE(review): assumes `block[i].layer[0]` wraps the self-attention module - confirm in LongT5Block.
+            self_attention_layer = self.encoder.block[layer].layer[0]
+            attention = next(m for m in self_attention_layer.children() if hasattr(m, "prune_heads"))
+            attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        decoder_head_mask: Optional[torch.FloatTensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
+            you should be able to pad the inputs on both the right and the left.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
+            Training](./longt5#training).
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
+            Training](./longt5#training).
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
+            `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LongT5Model
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
+        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")
+
+        >>> # Let's try a very long encoder input.
+        >>> input_ids = tokenizer(
+        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+
+        >>> # forward pass
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:
+                # Fetch the text through globals(): written as a bare name inside this class, the identifier would
+                # be mangled to `_LongT5Model__HEAD_MASK_WARNING_MSG` and raise NameError instead of warning.
+                warnings.warn(globals()["__HEAD_MASK_WARNING_MSG"], FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Decode, cross-attending to the encoder's last hidden state (`encoder_outputs[0]`)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    LONGT5 Model with a `language modeling` head on top.
+    """
+)
+class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_unexpected = [
+        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
+    ]
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
+
+    def __init__(self, config: LongT5Config):
+        super().__init__(config)
+        self.model_dim = config.d_model
+
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_decoder = False
+        encoder_config.use_cache = False
+        encoder_config.tie_encoder_decoder = False
+        self.encoder = LongT5Stack(encoder_config, self.shared)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.tie_encoder_decoder = False
+        decoder_config.num_layers = config.num_decoder_layers
+        self.decoder = LongT5Stack(decoder_config, self.shared)
+
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)
+        self.decoder.set_input_embeddings(new_embeddings)
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
+
+    def get_encoder(self):
+        return self.encoder
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        decoder_head_mask: Optional[torch.FloatTensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
+            you should be able to pad the inputs on both the right and the left.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
+            Training](./longt5#training).
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
+            Training](./longt5#training).
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
+            `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Labels for computing the language modeling (cross-entropy) loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
+            labels in `[0, ..., config.vocab_size - 1]`
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
+        >>> model = LongT5ForConditionalGeneration.from_pretrained(
+        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
+        ... )
+
+        >>> # Let's try a very long input.
+        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
+        >>> input_ids = inputs.input_ids
+
+        >>> outputs = model.generate(input_ids)
+        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        abstractthe aim of this article is to provide an overview of the literature on the role of dog
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+        if head_mask is not None and decoder_head_mask is None:
+            if self.config.num_layers == self.config.num_decoder_layers:  # reuse only when layer counts match
+                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
+                decoder_head_mask = head_mask
+
+        # Encode if needed (training, first prediction pass)
+        if encoder_outputs is None:
+            # Run the encoder on `input_ids` or on the precomputed `inputs_embeds`
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                head_mask=head_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):  # wrap a caller-supplied plain sequence
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        hidden_states = encoder_outputs[0]  # first encoder output; passed to the decoder as `encoder_hidden_states`
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            past_key_values=past_key_values,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        sequence_output = decoder_outputs[0]
+
+        if self.config.tie_word_embeddings:
+            # Rescale output before projecting on vocab
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+            sequence_output = sequence_output * (self.model_dim**-0.5)
+
+        lm_logits = self.lm_head(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-100)  # positions labelled -100 do not contribute to the loss
+
+            labels = labels.to(lm_logits.device)  # keep labels on the same device as the logits
+            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))  # flatten to one row per token
+            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+
+        if not return_dict:
+            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs  # logits, rest of decoder, then encoder
+            return ((loss,) + output) if loss is not None else output  # loss is prepended only when it was computed
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return self._shift_right(labels)  # same shift that `forward` applies when only `labels` are given
+
+
+@auto_docstring
+class LongT5EncoderModel(LongT5PreTrainedModel):
+    _tied_weights_keys = ["encoder.embed_tokens.weight"]
+    _keys_to_ignore_on_load_unexpected = [r"decoder"]  # regex pattern; presumably skips decoder weights — confirm
+
+    def __init__(self, config: LongT5Config):
+        super().__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)  # work on a copy so the caller's config object is not mutated
+        encoder_config.use_cache = False
+        encoder_config.tie_encoder_decoder = False
+        self.encoder = LongT5Stack(encoder_config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.set_input_embeddings(new_embeddings)  # keep the encoder stack in sync with `self.shared`
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
+
+    def get_encoder(self):
+        return self.encoder
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)  # NOTE(review): assumes `encoder.layer[i].attention` exists on LongT5Stack — confirm
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
+            you should be able to pad the inputs on both the right and the left.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
+            Training](./longt5#training).
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, LongT5EncoderModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
+        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
+        >>> input_ids = tokenizer(
+        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        return encoder_outputs  # returned unchanged: whatever the encoder stack produced
+
+
+__all__ = ["LongT5EncoderModel", "LongT5ForConditionalGeneration", "LongT5Model", "LongT5PreTrainedModel"]  # names exported by `import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d12684f6a69266ebebbf58dac512d70866e1cf11
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/configuration_luke.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/configuration_luke.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebdd9a128d61691d03d2e10d1375ae242ee9551a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/configuration_luke.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/tokenization_luke.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/tokenization_luke.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..629dd625d7623b6c89d81e077e1efdc7cf4825c3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/luke/__pycache__/tokenization_luke.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c456a6b88378a5461e11dadfafb8820698b6b9d0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: import every submodule's public names eagerly
+    from .configuration_m2m_100 import *
+    from .modeling_m2m_100 import *
+    from .tokenization_m2m_100 import *
+else:  # at runtime: replace this module object in `sys.modules` with a `_LazyModule`
+    import sys
+
+    _file = globals()["__file__"]  # path of this `__init__.py`, used twice on the next line
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/configuration_m2m_100.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/configuration_m2m_100.py
new file mode 100644
index 0000000000000000000000000000000000000000..620641f1cf4e711bc54de79e02eb90328668c219
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/configuration_m2m_100.py
@@ -0,0 +1,284 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""M2M100 model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PreTrainedTokenizer
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
+from ...onnx.utils import compute_effective_axis_dimension
+from ...utils import TensorType, is_torch_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class M2M100Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`M2M100Model`]. It is used to instantiate an
+    M2M100 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the M2M100
+    [facebook/m2m100_418M](https://huggingface.co/facebook/m2m100_418M) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 128112):
+            Vocabulary size of the M2M100 model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`M2M100Model`].
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
+    Example:
+
+    ```python
+    >>> from transformers import M2M100Config, M2M100Model
+
+    >>> # Initializing a M2M100 facebook/m2m100_418M style configuration
+    >>> configuration = M2M100Config()
+
+    >>> # Initializing a model (with random weights) from the facebook/m2m100_418M style configuration
+    >>> model = M2M100Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "m2m_100"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}  # generic name -> M2M100 name
+
+    def __init__(
+        self,
+        vocab_size=128112,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.05,
+        decoder_layerdrop=0.05,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="relu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=2,
+        scale_embedding=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers  # generic layer count mirrors the encoder depth only
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        super().__init__(  # token ids and encoder-decoder flags are stored by the base config, along with any extra kwargs
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+
+class M2M100OnnxConfig(OnnxSeq2SeqConfigWithPast):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict(  # input name -> {axis index: symbolic axis name}
+            [
+                ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+            ]
+        )
+
+        if self.use_past:
+            common_inputs["decoder_input_ids"] = {0: "batch"}  # only the batch axis is named when past is used
+            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
+        else:
+            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
+            common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
+
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction="inputs")  # mutates `common_inputs` in place
+        return common_inputs
+
+    # Copied from BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering
+    # A better name would be _generate_dummy_inputs_for_encoder_and_decoder because sequence classification and question
+    # answering are not supported for M2M100, but this name is preserved to be able to check that the copy matches what
+    # was done for BART so that it can be updated if need be.
+    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        # Copied from OnnxConfig.generate_dummy_inputs
+        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size  # join of a 1-item list is a no-op: unk_token repeated with no separator
+        common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
+        return common_inputs
+
+    # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_default_and_seq2seq_lm
+    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair, framework
+        )
+
+        # Generate decoder inputs
+        decoder_seq_length = seq_length if not self.use_past else 1  # a single decoder step when past is used
+        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, decoder_seq_length, is_pair, framework
+        )
+        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}  # prefix every key
+        common_inputs = dict(**encoder_inputs, **decoder_inputs)
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, encoder_seq_length = common_inputs["input_ids"].shape
+            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
+            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads  # unpacked as a pair
+            encoder_shape = (
+                batch,
+                num_encoder_attention_heads,
+                encoder_seq_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+            decoder_past_length = decoder_seq_length + 3  # NOTE(review): the +3 offset is unexplained here — confirm
+            decoder_shape = (
+                batch,
+                num_decoder_attention_heads,
+                decoder_past_length,
+                self._config.hidden_size // num_decoder_attention_heads,
+            )
+
+            common_inputs["decoder_attention_mask"] = torch.cat(  # extend the mask by `decoder_past_length` ones
+                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
+            )
+
+            common_inputs["past_key_values"] = []
+            # If the number of encoder and decoder layers are present in the model configuration, both are considered
+            num_encoder_layers, num_decoder_layers = self.num_layers
+            min_num_layers = min(num_encoder_layers, num_decoder_layers)
+            max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers  # holds (max - min), not max
+            remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
+
+            for _ in range(min_num_layers):
+                common_inputs["past_key_values"].append(
+                    (
+                        torch.zeros(decoder_shape),
+                        torch.zeros(decoder_shape),
+                        torch.zeros(encoder_shape),
+                        torch.zeros(encoder_shape),
+                    )
+                )
+            # TODO: test this.
+            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
+            for _ in range(min_num_layers, max_num_layers):  # NOTE(review): upper bound is (max - min), so this is empty unless max > 2 * min — confirm intent
+                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
+        return common_inputs
+
+    generate_dummy_inputs = _generate_dummy_inputs_for_default_and_seq2seq_lm  # public name bound to the same function
+
+
+__all__ = ["M2M100Config", "M2M100OnnxConfig"]  # names exported by `import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/modeling_m2m_100.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/modeling_m2m_100.py
new file mode 100644
index 0000000000000000000000000000000000000000..6015aa54d76b1ef6f7ccb326867ada6c287429a4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/modeling_m2m_100.py
@@ -0,0 +1,1445 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch M2M100 model."""
+
+import math
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+)
+from ...modeling_flash_attention_utils import (
+    FlashAttentionKwargs,
+)
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_m2m_100 import M2M100Config
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Return a new tensor holding `input_ids` moved one slot right, led by `decoder_start_token_id`; -100 becomes padding.
+    """
+    shifted = input_ids.new_zeros(input_ids.size())
+    shifted[:, 0] = decoder_start_token_id
+    shifted[:, 1:] = input_ids[:, :-1].clone()
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # labels may carry -100 placeholders; overwrite them with `pad_token_id`
+    ignore_positions = shifted.eq(-100)
+    shifted.masked_fill_(ignore_positions, pad_token_id)
+    return shifted
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers, counted from `padding_idx + 1 + past_key_values_length`.
+    Padding symbols keep the value `padding_idx`. This is modified from fairseq's `utils.make_positions`.
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()  # 1 where the token is not padding, 0 where it is
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->M2M100
+class M2M100ScaledWordEmbedding(nn.Embedding):
+    """An `nn.Embedding` whose `forward` returns the looked-up vectors multiplied by a constant `embed_scale`."""
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
+        self.embed_scale = embed_scale
+
+    def forward(self, input_ids: torch.Tensor):
+        # plain embedding lookup first, then the constant scale factor
+        looked_up = super().forward(input_ids)
+        return looked_up * self.embed_scale
+
+
+class M2M100SinusoidalPositionalEmbedding(nn.Module):
+    """Sinusoidal position embeddings kept in a non-persistent `weights` buffer that `forward` regrows on demand."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2  # extra rows allocated on top of the requested number of positions
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # rebuilding an existing table: keep the dtype and device of the buffer being replaced
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.register_buffer("weights", emb_weights, persistent=False)
+
+    @staticmethod
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build a `(num_embeddings, embedding_dim)` table: sines in the first half of the columns, cosines in the
+        second, a zero column appended when `embedding_dim` is odd, and row `padding_idx` zeroed when it is given.
+        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
+        "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / max(half_dim - 1, 1)  # max() avoids a zero divisor when embedding_dim is 2 or 3
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+
+        return emb.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values_length: int = 0,
+    ):
+        """Return `(bsz, seq_len, weights.shape[-1])` embeddings looked up from `weights`; position ids come from
+        `input_ids` when given, otherwise they are generated sequentially from the shape of `inputs_embeds`."""
+        if input_ids is not None:
+            bsz, seq_len = input_ids.size()
+            # Create the position ids from the input token ids. Any padded tokens remain padded.
+            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
+        else:
+            bsz, seq_len = inputs_embeds.size()[:-1]
+            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+
+        # expand embeddings if needed
+        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+        if max_pos > self.weights.size(0):
+            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor; only its shape (minus the last axis) and its device are read.
+            past_key_values_length: offset added to every generated position id.
+        Returns: torch.Tensor shaped like `inputs_embeds` without its last axis.
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
+
+
+# Copied from transformers.models.bart.modeling_bart.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    """Matmul/softmax attention; returns the context (axes 1 and 2 swapped, contiguous) and the final weights."""
+    scale = query.size(-1) ** -0.5 if scaling is None else scaling
+
+    scores = torch.matmul(query, key.transpose(2, 3)) * scale
+    if attention_mask is not None:
+        scores = scores + attention_mask  # the mask is additive
+
+    probs = nn.functional.softmax(scores, dim=-1)
+    if head_mask is not None:
+        # `head_mask` is reshaped to (1, -1, 1, 1): one factor per entry along axis 1
+        probs = probs * head_mask.view(1, -1, 1, 1)
+
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, value)
+    context = context.transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->M2M100
+class M2M100Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper; acts as self- or cross-attention per call."""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[M2M100Config] = None,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config  # NOTE(review): forward reads `config._attn_implementation`, so None would fail there
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+        self.layer_idx = layer_idx  # index used to address this layer's entry in the cache objects
+        if layer_idx is None and self.is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
+        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel. Returns `(attn_output, attn_weights)` after the output projection."""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        # determine input shapes
+        bsz, tgt_len = hidden_states.shape[:-1]
+        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
+
+        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
+        kv_input_shape = (bsz, src_len, -1, self.head_dim)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)
+
+        is_updated = False  # set from the cache below: whether this layer's cross-attention K/V are already stored
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
+            value_states = value_states.view(*kv_input_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below for non-eager configs
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            output_attentions=output_attentions,
+            head_mask=layer_head_mask,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()  # merge the trailing axes back together
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->M2M100, MBART->M2M100
+class M2M100EncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: M2M100Config):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = M2M100Attention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+            config=config,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*): forwarded unchanged to the self-attention module.
+        Returns:
+            `(hidden_states, attn_weights)`: the layer output and whatever weights the self-attention call returned.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)  # norm is applied before attention
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)  # norm is applied before the feed-forward block
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16:  # fp16 only: clamp to +/-(dtype max - 1000)
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        return hidden_states, attn_weights
+
+
+# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->M2M100, MBART->M2M100
+class M2M100DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: M2M100Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = M2M100Attention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            is_causal=True,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = M2M100Attention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[Optional[torch.Tensor], ...]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for self-attention heads in a given layer of size
+                `(decoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size `(decoder_attention_heads,)`.
+            past_key_values (`Cache`): cached past key and value projection states
+            output_attentions (`bool`, *optional*): when true, both attention weights are appended to the output.
+            use_cache (`bool`, *optional*): accepted by the signature but not read in this method body.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens; passed to the self-attention call only.
+        Returns:
+            `(hidden_states,)`, extended by `(self_attn_weights, cross_attn_weights)` when `output_attentions` is set.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_weights = None  # stays None when no encoder states are supplied
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+@auto_docstring
+class M2M100PreTrainedModel(PreTrainedModel):
+    config: M2M100Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["M2M100EncoderLayer", "M2M100DecoderLayer"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    # Doesn't support `compile` (dynamic control flow). Can be fixed but low usage model
+    _can_compile_fullgraph = False
+
+    def _init_weights(self, module):
+        """Initialise `module` in place: N(0, `init_std`) weights for Linear/Embedding, ones/zeros for LayerNorm."""
+        init_std = self.config.init_std
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=init_std)
+            if isinstance(module, nn.Linear):
+                if module.bias is not None:
+                    module.bias.data.zero_()
+            elif module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # the padding row is reset to zeros after sampling
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        """Convert a padding mask into the form the configured attention backend expects (`None` passes through)."""
+        if attention_mask is None:
+            return None
+        backend = self.config._attn_implementation
+        if backend == "flash_attention_2":
+            # keep the mask only when it contains at least one 0 entry
+            return attention_mask if 0 in attention_mask else None
+        if backend == "sdpa":
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            return _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+        if backend == "flex_attention":
+            if not isinstance(attention_mask, torch.Tensor):
+                return attention_mask
+            return make_flex_block_causal_mask(attention_mask, is_causal=False)
+        # any other backend: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        return _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+    ):
+        """Return the causal mask in the form the configured attention backend expects; `None` means no mask tensor."""
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            # Other attention flavors support in-built causal (when `mask is None`); flex needs a block mask regardless
+            elif attention_mask is None:
+                attention_mask = make_flex_block_causal_mask(
+                    torch.ones(
+                        size=(input_tensor.shape[0], input_tensor.shape[1]),
+                        device=input_tensor.device,  # `attention_mask` is None here, so it has no `.device` to read
+                    )
+                )
+            return attention_mask
+
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: Optional[torch.Tensor],
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`int`): Batch size the generated mask is expanded to.
+            **kwargs: Accepted by the signature but not read in this method body.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min  # most negative finite value of `dtype`, used as the masked fill
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)  # zero the diagonal and everything below it
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0  # sum is 0 only where the causal entry is 0 and the 2D mask entry is 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask
+    def _update_cross_attn_mask(
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(encoder_attention_mask, torch.Tensor):
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask
+
+
+class M2M100Encoder(M2M100PreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`M2M100EncoderLayer`].
+
+    Args:
+        config: M2M100Config
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        embed_dim = config.d_model
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_position_embeddings
+        embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+        self.embed_tokens = M2M100ScaledWordEmbedding(
+            config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
+        )
+
+        if embed_tokens is not None:
+            self.embed_tokens.weight = embed_tokens.weight
+
+        self.embed_positions = M2M100SinusoidalPositionalEmbedding(
+            config.max_position_embeddings,
+            embed_dim,
+            self.padding_idx,
+        )
+        self.layers = nn.ModuleList([M2M100EncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        embed_pos = self.embed_positions(input_ids, inputs_embeds)
+        embed_pos = embed_pos.to(inputs_embeds.device)
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        attention_mask = self._update_full_mask(
+            attention_mask,
+            inputs_embeds,
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class M2M100Decoder(M2M100PreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`M2M100DecoderLayer`]
+
+    Args:
+        config: M2M100Config
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_position_embeddings
+        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+        self.embed_tokens = M2M100ScaledWordEmbedding(
+            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
+        )
+
+        if embed_tokens is not None:
+            self.embed_tokens.weight = embed_tokens.weight
+
+        self.embed_positions = M2M100SinusoidalPositionalEmbedding(
+            config.max_position_embeddings,
+            config.d_model,
+            self.padding_idx,
+        )
+        self.layers = nn.ModuleList([M2M100DecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
+                cache in the correct position and to infer the complete sequence length.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # initialize `past_key_values`
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        batch_size, seq_length = inputs_embeds.size()[:-1]
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
+            )
+
+        if attention_mask is None and not is_torchdynamo_compiling():
+            # required mask seq length can be calculated via length of past cache
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+
+        self_attn_cache = (
+            past_key_values.self_attention_cache
+            if isinstance(past_key_values, EncoderDecoderCache)
+            else past_key_values
+        )
+
+        attention_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            self_attn_cache,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            input_shape,
+            inputs_embeds,
+        )
+        # embed positions
+        positions = self.embed_positions(input_ids, inputs_embeds, past_key_values_length)
+        positions = positions.to(inputs_embeds.device)
+
+        hidden_states = inputs_embeds + positions
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if output_attentions else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                    encoder_attention_mask=encoder_attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+                    ),
+                    past_key_values=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                continue
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+                all_cross_attentions += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring
+class M2M100Model(M2M100PreTrainedModel):
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
+    def __init__(self, config: M2M100Config):
+        super().__init__(config)
+
+        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
+        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+        self.shared = M2M100ScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale)
+
+        self.encoder = M2M100Encoder(config, self.shared)
+        self.decoder = M2M100Decoder(config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, value):
+        self.shared = value
+        self.encoder.embed_tokens = self.shared
+        self.decoder.embed_tokens = self.shared
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)
+
+    def get_encoder(self):
+        return self.encoder
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            M2M100 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The M2M100 Model with a language modeling head. Can be used for summarization.
+    """
+)
+class M2M100ForConditionalGeneration(M2M100PreTrainedModel, GenerationMixin):
+    base_model_prefix = "model"
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
+
+    def __init__(self, config: M2M100Config):
+        super().__init__(config)
+        self.model = M2M100Model(config)
+        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            M2M100 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size - 1]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.
+
+        Example Translation:
+
+        ```python
+        >>> from transformers import AutoTokenizer, M2M100ForConditionalGeneration
+
+        >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M")
+
+        >>> text_to_translate = "Life is like a box of chocolates"
+        >>> model_inputs = tokenizer(text_to_translate, return_tensors="pt")
+
+        >>> # translate to French
+        >>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
+        >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # When only `labels` are supplied, build `decoder_input_ids` from them via `shift_tokens_right`.
+        if labels is not None:
+            if decoder_input_ids is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        lm_logits = self.lm_head(outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            # move labels to the correct device to enable PP
+            labels = labels.to(lm_logits.device)
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+        # Tuple output: logits followed by the remaining model outputs, with the loss prepended when computed.
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+__all__ = ["M2M100ForConditionalGeneration", "M2M100Model", "M2M100PreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccced10f2bacfb30aa03bf6f30f2e8f9343c64d3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -0,0 +1,384 @@
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for M2M100."""
+
+import json
+import os
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Optional, Union
+
+import sentencepiece
+
+from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "spm_file": "sentencepiece.bpe.model",
+    "tokenizer_config_file": "tokenizer_config.json",
+}
+
+
+# fmt: off
+FAIRSEQ_LANGUAGE_CODES = {
+    "m2m100": ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"],
+    "wmt21": ['en', 'ha', 'is', 'ja', 'cs', 'ru', 'zh', 'de']
+}
+# fmt: on
+
+
+@requires(backends=("sentencepiece",))
+class M2M100Tokenizer(PreTrainedTokenizer):
+    """
+    Construct an M2M100 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        spm_file (`str`):
+            Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
+            contains the vocabulary.
+        src_lang (`str`, *optional*):
+            A string representing the source language.
+        tgt_lang (`str`, *optional*):
+            A string representing the target language.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        language_codes (`str`, *optional*, defaults to `"m2m100"`):
+            What language codes to use. Should be one of `"m2m100"` or `"wmt21"`.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Examples:
+
+    ```python
+    >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+
+    >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
+    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
+    >>> outputs = model(**model_inputs)  # should work
+    ```"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: list[int] = []
+    suffix_tokens: list[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        spm_file,
+        src_lang=None,
+        tgt_lang=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        unk_token="<unk>",
+        language_codes="m2m100",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        num_madeup_words=8,
+        **kwargs,
+    ) -> None:
+        """Load the JSON vocabulary and SentencePiece model and register one `__xx__` token per language code."""
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.language_codes = language_codes
+        fairseq_language_code = FAIRSEQ_LANGUAGE_CODES[language_codes]
+        self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in fairseq_language_code}
+
+        # Register every language token as an additional special token. Work on a copy so a caller-supplied list
+        # is not mutated; compare via `str()` so non-`str` entries can match (assumes `str()` yields the token text).
+        additional_special_tokens = list(kwargs.pop("additional_special_tokens", None) or [])
+        known_tokens = {str(tok) for tok in additional_special_tokens}
+        additional_special_tokens += [t for t in self.lang_code_to_token.values() if t not in known_tokens]
+
+        self.vocab_file = vocab_file
+        self.encoder = load_json(vocab_file)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.spm_file = spm_file
+        self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
+
+        self.encoder_size = len(self.encoder)
+        # Language tokens get the ids `encoder_size`, `encoder_size + 1`, ... in `fairseq_language_code` order.
+        self.lang_token_to_id = {
+            self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)
+        }
+        self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(fairseq_language_code)}
+        self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}
+
+        self._src_lang = src_lang if src_lang is not None else "en"
+        self.tgt_lang = tgt_lang
+        self.cur_lang_id = self.get_lang_id(self._src_lang)
+
+        self.num_madeup_words = num_madeup_words
+
+        super().__init__(
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            language_codes=language_codes,
+            sp_model_kwargs=self.sp_model_kwargs,
+            additional_special_tokens=additional_special_tokens,
+            num_madeup_words=num_madeup_words,
+            **kwargs,
+        )
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.encoder)  # entries of the JSON vocabulary only; language-token ids start at this value
+
+    def get_vocab(self) -> dict:
+        """Return a token -> id mapping for ids `0..vocab_size - 1`, overlaid with `added_tokens_encoder`."""
+        base_items = ((self.convert_ids_to_tokens(idx), idx) for idx in range(self.vocab_size))
+        return {**dict(base_items), **self.added_tokens_encoder}
+
+    @property
+    def src_lang(self) -> str:
+        return self._src_lang  # backing attribute written by the setter below
+
+    @src_lang.setter
+    def src_lang(self, new_src_lang: str) -> None:
+        self._src_lang = new_src_lang
+        self.set_src_lang_special_tokens(self._src_lang)  # keep prefix/suffix tokens in sync with the new language
+
+    def _tokenize(self, text: str) -> list[str]:
+        return self.sp_model.encode(text, out_type=str)  # `out_type=str` requests piece strings from SentencePiece
+
+    def _convert_token_to_id(self, token):
+        """Map `token` to its id: language tokens first, then the JSON vocabulary, else the id of `unk_token`."""
+        lang_id = self.lang_token_to_id.get(token)
+        return lang_id if lang_id is not None else self.encoder.get(token, self.encoder[self.unk_token])
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) into a token (str); ids unknown to both tables map to `unk_token`."""
+        # Language-token ids are kept in their own table, which takes precedence over the decoder.
+        lang_token = self.id_to_lang_token.get(index)
+        return lang_token if lang_token is not None else self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """Join `tokens` into one string: ordinary pieces go through SentencePiece, special tokens are kept as-is."""
+        # The special-token list is loop-invariant: read it once and use O(1) set membership.
+        special_tokens = set(self.all_special_tokens)
+        pieces, pending = [], []
+        for token in tokens:
+            if token in special_tokens:
+                # Flush the buffered run first so the special token itself is never passed to SentencePiece.
+                pieces.extend((self.sp_model.decode(pending), token))
+                pending = []
+            else:
+                pending.append(token)
+        return "".join(pieces + [self.sp_model.decode(pending)]).strip()
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Build the special-token mask for one sequence, or a pair of sequences, that does not yet contain special
+        tokens. The tokenizer `prepare_for_model` method calls this when it adds special tokens.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                IDs of the first sequence.
+            token_ids_1 (`list[int]`, *optional*):
+                IDs of the second sequence when a pair is encoded.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Set to `True` if the IDs already include the model's special tokens; the call is then delegated to
+                the parent class.
+
+        Returns:
+            `list[int]`: One entry per output position: 1 marks a special token, 0 marks a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        # Positions of the current prefix/suffix tokens are special; everything between them is sequence content.
+        body_length = len(token_ids_0) + (0 if token_ids_1 is None else len(token_ids_1))
+        mask = [1] * len(self.prefix_tokens)
+        mask.extend([0] * body_length)
+        return mask + [1] * len(self.suffix_tokens)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences by concatenating them and adding special tokens.
+        The result is `prefix_tokens + X + suffix_tokens`, where `X` represents the sequence(s):
+
+        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+        - `decoder_input_ids`: (for decoder) `[tgt_lang_code] X [eos]`
+
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+
+    def __getstate__(self) -> dict:
+        state = self.__dict__.copy()  # shallow copy, so the live instance keeps its `sp_model`
+        state["sp_model"] = None  # excluded from the returned state; `__setstate__` reloads it from `spm_file`
+        return state
+
+    def __setstate__(self, d: dict) -> None:
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+        # `__getstate__` stored `None` for `sp_model`, so rebuild it from `spm_file`.
+        self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Write the JSON vocabulary and the SentencePiece model into `save_directory` (file names are prefixed with
+        `filename_prefix` + "-" when given) and return both paths. Raises `OSError` if it is not a directory.
+        """
+        target_dir = Path(save_directory)
+        if not target_dir.is_dir():
+            raise OSError(f"{save_directory} should be a directory")
+        name_prefix = filename_prefix + "-" if filename_prefix else ""
+        vocab_save_path = target_dir / (name_prefix + self.vocab_files_names["vocab_file"])
+        spm_save_path = target_dir / (name_prefix + self.vocab_files_names["spm_file"])
+
+        save_json(self.encoder, vocab_save_path)
+        source_exists = os.path.isfile(self.spm_file)
+        if source_exists and os.path.abspath(self.spm_file) != os.path.abspath(spm_save_path):
+            copyfile(self.spm_file, spm_save_path)
+        elif not source_exists:
+            # No model file on disk to copy: write out the serialized in-memory model instead.
+            with open(spm_save_path, "wb") as fi:
+                fi.write(self.sp_model.serialized_model_proto())
+        return (str(vocab_save_path), str(spm_save_path))
+
+    def prepare_seq2seq_batch(
+        self,
+        src_texts: list[str],
+        src_lang: str = "en",
+        tgt_texts: Optional[list[str]] = None,
+        tgt_lang: str = "ro",
+        **kwargs,
+    ) -> BatchEncoding:
+        self.src_lang = src_lang  # property setter: also calls `set_src_lang_special_tokens`
+        self.tgt_lang = tgt_lang
+        self.set_src_lang_special_tokens(self.src_lang)  # same call the setter above just made
+        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+    def _build_translation_inputs(self, raw_inputs, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs):
+        """Encode `raw_inputs` with `src_lang` active and attach the `tgt_lang` id as `forced_bos_token_id`."""
+        missing_language = src_lang is None or tgt_lang is None
+        if missing_language:
+            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
+        self.src_lang = src_lang
+        encoded = self(raw_inputs, add_special_tokens=True, **extra_kwargs)
+        encoded["forced_bos_token_id"] = self.get_lang_id(tgt_lang)
+        return encoded
+
+    def _switch_to_input_mode(self):
+        self.set_src_lang_special_tokens(self.src_lang)  # prefix/suffix tokens now reflect the source language
+
+    def _switch_to_target_mode(self):
+        self.set_tgt_lang_special_tokens(self.tgt_lang)  # `tgt_lang` of `None` (constructor default) raises `KeyError`
+
+    def set_src_lang_special_tokens(self, src_lang: str) -> None:
+        """Reset the special tokens to the source lang setting. Prefix=[src_lang_code] and suffix=[eos]."""
+        lang_token = self.get_lang_token(src_lang)
+        self.cur_lang_id = self.lang_token_to_id[lang_token]
+        self.prefix_tokens = [self.cur_lang_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
+        """Reset the special tokens to the target language setting. Prefix=[tgt_lang_code] and suffix=[eos]."""
+        lang_token = self.get_lang_token(tgt_lang)
+        self.cur_lang_id = self.lang_token_to_id[lang_token]
+        self.prefix_tokens = [self.cur_lang_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+    def get_lang_token(self, lang: str) -> str:
+        return self.lang_code_to_token[lang]  # plain dict lookup: an unknown code raises `KeyError`
+
+    def get_lang_id(self, lang: str) -> int:
+        lang_token = self.get_lang_token(lang)  # e.g. "en" -> "__en__"
+        return self.lang_token_to_id[lang_token]
+
+
+def load_spm(path: str, sp_model_kwargs: dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
+    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)  # kwargs are forwarded to the constructor as-is
+    spm.Load(str(path))  # `path` is coerced to `str` before loading
+    return spm
+
+
+def load_json(path: str) -> Union[dict, list]:
+    """Parse the JSON file at `path`, always decoding it as UTF-8, and return the resulting object."""
+    return json.loads(Path(path).read_text(encoding="utf-8"))
+
+
+def save_json(data, path: str) -> None:
+    """Write `data` to `path` as 2-space-indented UTF-8 JSON; it is serialized before the file is opened."""
+    Path(path).write_text(json.dumps(data, indent=2), encoding="utf-8")
+
+
+__all__ = ["M2M100Tokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7cd8c28da631fdd44cc09458380527b6323d044
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .tokenization_mbart50 import *
+    from .tokenization_mbart50_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50.py
new file mode 100644
index 0000000000000000000000000000000000000000..413beaa03a83e4eddb9eb5a050d85c347ecbe2b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50.py
@@ -0,0 +1,359 @@
+# coding=utf-8
+# Copyright 2021 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from shutil import copyfile
+from typing import Any, Optional
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+
+
+FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]  # fmt: skip
+
+
+@requires(backends=("sentencepiece",))
+class MBart50Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a MBart50 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        src_lang (`str`, *optional*):
+            A string representing the source language.
+        tgt_lang (`str`, *optional*):
+            A string representing the target language.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Examples:
+
+    ```python
+    >>> from transformers import MBart50Tokenizer
+
+    >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
+    >>> # model(**model_inputs) should work
+    ```"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: list[int] = []
+    suffix_tokens: list[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        src_lang=None,
+        tgt_lang=None,
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
+        kwargs["additional_special_tokens"] += [
+            code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
+        ]
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # Mimic fairseq token-to-id alignment for the first 4 token
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+
+        self.sp_model_size = len(self.sp_model)
+        self.lang_code_to_id = {
+            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
+        }
+        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
+
+        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+        super().__init__(
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+        self._src_lang = src_lang if src_lang is not None else "en_XX"
+        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
+        self.tgt_lang = tgt_lang
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1  # Plus 1 for the mask token
+
+    @property
+    def src_lang(self) -> str:
+        return self._src_lang
+
+    @src_lang.setter
+    def src_lang(self, new_src_lang: str) -> None:
+        self._src_lang = new_src_lang
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    def __getstate__(self) -> dict:
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d: dict) -> None:
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    def get_vocab(self) -> dict:
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> list[str]:
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token (str) into an id: `fairseq_tokens_to_ids` entries first, then the shifted spm id."""
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        spm_id = self.sp_model.PieceToId(token)
+
+        # An spm id of 0 is falsy, so it is reported as `unk_token_id` instead of 0 + fairseq_offset
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) into a token (str) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)  # undo the fairseq shift before querying spm
+
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string.strip()
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Build the special-tokens mask for a token list (or pair) that has no special tokens added yet. This method is
+        called when adding special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        prefix_ones = [1] * len(self.prefix_tokens)
+        suffix_ones = [1] * len(self.suffix_tokens)
+        if token_ids_1 is None:
+            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
+        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An MBART-50 sequence has the following format, where `X` represents the sequence:
+
+        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+        - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
+
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+
+    def _build_translation_inputs(
+        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
+    ):
+        """Used by translation pipeline, to prepare inputs for the generate function"""
+        if src_lang is None or tgt_lang is None:
+            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
+        self.src_lang = src_lang
+        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
+        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
+        inputs["forced_bos_token_id"] = tgt_lang_id
+        return inputs
+
+    def prepare_seq2seq_batch(
+        self,
+        src_texts: list[str],
+        src_lang: str = "en_XX",
+        tgt_texts: Optional[list[str]] = None,
+        tgt_lang: str = "ro_RO",
+        **kwargs,
+    ) -> BatchEncoding:
+        self.src_lang = src_lang
+        self.tgt_lang = tgt_lang
+        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
+
+    def set_src_lang_special_tokens(self, src_lang: str) -> None:
+        """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
+        self.cur_lang_code_id = self.lang_code_to_id[src_lang]
+        self.prefix_tokens = [self.cur_lang_code_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
+        """Reset the special tokens to the target language setting. prefix=[tgt_lang_code] and suffix=[eos]."""
+        self.cur_lang_code_id = self.lang_code_to_id[tgt_lang]
+        self.prefix_tokens = [self.cur_lang_code_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+
+__all__ = ["MBart50Tokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..985b0929f87c5516a2edd8e02c6aaaa8926d496e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -0,0 +1,258 @@
+# coding=utf-8
+# Copyright 2021 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from shutil import copyfile
+from typing import Optional
+
+from tokenizers import processors
+
+from ...tokenization_utils import AddedToken, BatchEncoding
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import is_sentencepiece_available, logging
+
+
+if is_sentencepiece_available():
+    from .tokenization_mbart50 import MBart50Tokenizer
+else:
+    MBart50Tokenizer = None
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
+
+
+FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]  # fmt: skip
+
+
+class MBart50TokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" MBART tokenizer for mBART-50 (backed by HuggingFace's *tokenizers* library). Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        src_lang (`str`, *optional*):
+            A string representing the source language.
+        tgt_lang (`str`, *optional*):
+            A string representing the target language.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+
+    Examples:
+
+    ```python
+    >>> from transformers import MBart50TokenizerFast
+
+    >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
+    >>> # model(**model_inputs) should work
+    ```"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = MBart50Tokenizer
+
+    prefix_tokens: list[int] = []
+    suffix_tokens: list[int] = []
+
+    def __init__(
+        self,
+        vocab_file=None,
+        src_lang=None,
+        tgt_lang=None,
+        tokenizer_file=None,
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        **kwargs,
+    ):
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
+        kwargs["additional_special_tokens"] += [
+            code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
+        ]
+
+        super().__init__(
+            vocab_file,
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            tokenizer_file=tokenizer_file,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+
+        self.lang_code_to_id = {
+            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
+        }
+
+        self._src_lang = src_lang if src_lang is not None else "en_XX"
+        self.tgt_lang = tgt_lang
+        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    @property
+    def src_lang(self) -> str:
+        return self._src_lang
+
+    @src_lang.setter
+    def src_lang(self, new_src_lang: str) -> None:
+        self._src_lang = new_src_lang
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens, as last set by `set_src_lang_special_tokens` / `set_tgt_lang_special_tokens`.
+
+        An MBART-50 sequence has the following format, where `X` represents the sequence:
+
+        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+        - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
+
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+
+    def prepare_seq2seq_batch(
+        self,
+        src_texts: list[str],
+        src_lang: str = "en_XX",
+        tgt_texts: Optional[list[str]] = None,
+        tgt_lang: str = "ro_RO",
+        **kwargs,
+    ) -> BatchEncoding:
+        self.src_lang = src_lang
+        self.tgt_lang = tgt_lang
+        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
+
+    def set_src_lang_special_tokens(self, src_lang: str) -> None:
+        """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
+        self.cur_lang_code_id = self.convert_tokens_to_ids(src_lang)
+        self.prefix_tokens = [self.cur_lang_code_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
+    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
+        """Reset the special tokens to the target language setting. prefix=[tgt_lang_code] and suffix=[eos]."""
+        self.cur_lang_code_id = self.convert_tokens_to_ids(tgt_lang)
+        self.prefix_tokens = [self.cur_lang_code_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
+    def _build_translation_inputs(
+        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
+    ):
+        """Used by translation pipeline, to prepare inputs for the generate function"""
+        if src_lang is None or tgt_lang is None:
+            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
+        self.src_lang = src_lang
+        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
+        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
+        inputs["forced_bos_token_id"] = tgt_lang_id
+        return inputs
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["MBart50TokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..609613a79e3e6112d362af8bb834f6b550600479
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/configuration_megatron_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/configuration_megatron_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cb62459b99bb4bd04084bb7a400f4222f5ac884
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/configuration_megatron_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/modeling_megatron_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/modeling_megatron_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44591bfa4af9c576217d322de11bc1f4407028cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_bert/__pycache__/modeling_megatron_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed3361fee50df163f3021464871bb74457a4f0d4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9434ee35e80e1a142df1a28dff1ed0b9db8e3a3e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75074f06a243e15c20c13311028c035400749c52
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e113347c1e24257bbb23d95fd8495d8321a108f7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19c9a737ea368bdaf5382158390a04630d8d044f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08e86af2d2711e5ce887b68be3684881cfdc3093
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92b950df843d1f09bf3458cf6dc76a598bd309cc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modeling_ministral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modeling_ministral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a474d618d79e0477d06223b40b135ab1dc8312c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modeling_ministral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..18a5657cd2ec6dbf39c6e794fae09bf65753da8f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_mistral import *
+    from .modeling_flax_mistral import *
+    from .modeling_mistral import *
+    from .modeling_tf_mistral import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/configuration_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/configuration_mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9f66b1a2fbe9117108b949107cd42246a7760b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/configuration_mistral.py
@@ -0,0 +1,174 @@
+# coding=utf-8
+# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Mistral model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MistralConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate a
+    Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1.
+
+    [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+    [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`MistralModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+        head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
+            The attention head dimension. `__init__` stores the argument as passed (`None` by default) without computing the fallback.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+            The maximum sequence length that this model might ever be used with. Mistral's sliding window attention
+            allows sequences of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention window size. If not specified, will default to `4096`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import MistralModel, MistralConfig
+
+    >>> # Initializing a Mistral 7B style configuration
+    >>> configuration = MistralConfig()
+
+    >>> # Initializing a model from the Mistral 7B style configuration
+    >>> model = MistralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mistral"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `MistralModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        head_dim=None,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        sliding_window=4096,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.head_dim = head_dim  # stored as passed; remains None unless the caller sets it
+
+        # for backward compatibility: an explicit `None` falls back to one key/value head per attention head (MHA)
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        if "layer_types" in kwargs:  # warning only; `layer_types` is still forwarded to the parent through **kwargs
+            logger.warning_once(
+                "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility."
+            )
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["MistralConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_flax_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_flax_mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c084ee114d762de16d11710cb05ec49d41e3676
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_flax_mistral.py
@@ -0,0 +1,744 @@
+# coding=utf-8
+# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flax Mistral model."""
+
+from typing import Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutput,
+    FlaxBaseModelOutputWithPast,
+    FlaxCausalLMOutput,
+    FlaxCausalLMOutputWithCrossAttentions,
+)
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, logging
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward
+from .configuration_mistral import MistralConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MistralConfig"
+_REAL_CHECKPOINT_FOR_DOC = "mistralai/Mistral-7B-v0.1"
+_CHECKPOINT_FOR_DOC = "ksmcg/Mistral-tiny"
+
+MISTRAL_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`MistralConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or
+            `jax.numpy.bfloat16`.
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+MISTRAL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRMSNorm with Llama->Mistral
+class FlaxMistralRMSNorm(nn.Module):  # root-mean-square normalization over the last axis with a learned scale
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.epsilon = self.config.rms_norm_eps
+        self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size)  # initialised to ones
+
+    def __call__(self, hidden_states):
+        variance = jnp.asarray(hidden_states, dtype=jnp.float32)  # the statistic is always computed in float32
+        variance = jnp.power(variance, 2)
+        variance = variance.mean(-1, keepdims=True)  # mean of squares over the last axis; no mean is subtracted
+        # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt`
+        hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon)
+        # cast to the module's compute dtype first, then apply the learned scale
+        return self.weight * jnp.asarray(hidden_states, dtype=self.dtype)
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaRotaryEmbedding with Llama->Mistral
+class FlaxMistralRotaryEmbedding(nn.Module):  # applies rotary position embeddings to a key tensor and a query tensor
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        head_dim = self.config.hidden_size // self.config.num_attention_heads  # NOTE(review): `config.head_dim` is not consulted
+        self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim)  # NOTE(review): `config.rope_theta` is not passed
+
+    def __call__(self, key, query, position_ids):  # returns `(key, query)`, in that order
+        sincos = self.sincos[position_ids]  # select the table rows for the requested positions
+        sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1)  # the table stores sin in the first half, cos in the second
+        # key and query are rotated with the same sin/cos values
+        key = apply_rotary_pos_emb(key, sin_pos, cos_pos)
+        query = apply_rotary_pos_emb(query, sin_pos, cos_pos)
+        # cast both results to the module's compute dtype
+        key = jnp.asarray(key, dtype=self.dtype)
+        query = jnp.asarray(query, dtype=self.dtype)
+
+        return key, query
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaMLP with Llama->Mistral
+class FlaxMistralMLP(nn.Module):  # gated feed-forward block: down_proj(up_proj(x) * act(gate_proj(x)))
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim
+        # all three projections are bias-free and share one normal initializer with stddev `initializer_range`
+        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
+        self.act = ACT2FN[self.config.hidden_act]
+
+        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+
+    def __call__(self, hidden_states):
+        up_proj_states = self.up_proj(hidden_states)
+        gate_states = self.act(self.gate_proj(hidden_states))  # the activation is applied to the gate branch only
+        # the element-wise product of the two branches is projected back to `hidden_size`
+        hidden_states = self.down_proj(up_proj_states * gate_states)
+        return hidden_states
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(tensor, sin_pos, cos_pos):  # rotary embedding: tensor * cos + rotate_half(tensor) * sin
+    return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)
+
+
+# Adapted from transformers.models.llama.modeling_flax_llama.create_sinusoidal_positions (the table is no longer truncated)
+def create_sinusoidal_positions(num_pos, dim):
+    """Return a float32 `(num_pos, 1, 2 * dim)` table for even `dim`: sin in `[..., :dim]`, cos in `[..., dim:]` (base 10000)."""
+    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
+    freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
+    emb = np.concatenate((freqs, freqs), axis=-1)  # every frequency appears twice, once per half of the head dimension
+    out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1)
+    return jnp.array(out)  # was `out[:, :, :num_pos]`, which cut into the sin/cos columns whenever num_pos < 2 * dim
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.rotate_half
+def rotate_half(tensor):
+    """Move the back half of the last axis to the front, negated, followed by the unchanged front half."""
+    # The split point is floor(size / 2), so for an odd-sized axis the back half is the longer one.
+    half = tensor.shape[-1] // 2
+    front, back = tensor[..., :half], tensor[..., half:]
+    return jnp.concatenate((-back, front), axis=-1)
+
+
+class FlaxMistralAttention(nn.Module):  # grouped-query self-attention with rotary embeddings and a sliding-window causal mask
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        config = self.config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads  # NOTE(review): `config.head_dim` is not consulted here
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads  # query heads per key/value head
+        self.max_position_embeddings = config.max_position_embeddings
+        self.attention_softmax_in_fp32 = self.dtype is not jnp.float32  # attention weights use float32 for any other dtype
+        self.rope_theta = config.rope_theta  # NOTE(review): stored but not read anywhere else in this class
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Dense(self.num_heads * self.head_dim, use_bias=False, dtype=self.dtype)
+        self.k_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype)
+        self.v_proj = nn.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, dtype=self.dtype)
+        self.o_proj = nn.Dense(self.hidden_size, use_bias=False, dtype=self.dtype)
+        causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
+        self.causal_mask = jnp.triu(causal_mask, k=-(config.sliding_window or config.max_position_embeddings))  # no window (None/0) keeps the full causal mask; `or 0` left only the diagonal
+        self.rotary_emb = FlaxMistralRotaryEmbedding(self.config, dtype=self.dtype)
+
+    def _split_heads(self, hidden_states, num_heads):  # (..., features) -> (..., num_heads, head_dim) after the first two axes
+        return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):  # inverse of `_split_heads` for the full set of query heads
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,))
+
+    @nn.compact
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+        # on the initializing call the variables above are only created; key/value/mask are returned unchanged
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        init_cache: bool = False,
+    ) -> tuple[jnp.ndarray, jnp.ndarray]:  # `(attn_output,)`, plus `attn_weights` when `output_attentions` is set
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        # queries use `num_heads` heads; keys and values use the smaller `num_key_value_heads`
+        query_states = self._split_heads(query_states, self.num_heads)
+        key_states = self._split_heads(key_states, self.num_key_value_heads)
+        value_states = self._split_heads(value_states, self.num_key_value_heads)
+
+        key_states, query_states = self.rotary_emb(key_states, query_states, position_ids)
+        query_length, key_length = query_states.shape[1], key_states.shape[1]
+        if self.has_variable("cache", "cached_key"):  # cached decoding: take the mask rows starting at the cache index
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)  # NOTE(review): fails if `attention_mask` is left as None
+        attention_mask = combine_masks(attention_mask, causal_mask)
+
+        if self.has_variable("cache", "cached_key") or init_cache:
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+        key_states = jnp.repeat(key_states, self.num_key_value_groups, axis=2)  # expand key/value heads to match the query heads
+        value_states = jnp.repeat(value_states, self.num_key_value_groups, axis=2)
+        # turn the combined boolean mask into an additive bias: 0 where attention is allowed, the dtype minimum elsewhere
+        attention_bias = lax.select(
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+        )
+
+        # usual dot product attention
+        attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            deterministic=deterministic,
+            dropout_rate=self.config.attention_dropout,
+            dtype=attention_dtype,
+        )
+
+        if self.attention_softmax_in_fp32:  # cast the float32 weights back to the compute dtype
+            attn_weights = attn_weights.astype(self.dtype)
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.o_proj(attn_output)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaDecoderLayer with Llama->Mistral
+class FlaxMistralDecoderLayer(nn.Module):  # pre-norm block: self-attention then MLP, each added back onto its input
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.input_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype)
+        self.self_attn = FlaxMistralAttention(self.config, dtype=self.dtype)
+        self.post_attention_layernorm = FlaxMistralRMSNorm(self.config, dtype=self.dtype)
+        self.mlp = FlaxMistralMLP(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):  # returns `(hidden_states,)` followed by whatever extra outputs the attention module produced
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)  # normalise before attention; the residual keeps the raw input
+        outputs = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        # residual connection around the attention sub-layer
+        attn_output = outputs[0]
+        hidden_states = residual + attn_output
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        # residual connection around the MLP sub-layer
+        hidden_states = residual + hidden_states
+
+        return (hidden_states,) + outputs[1:]
+
+
+# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Mistral, GPT_NEO->MISTRAL, transformer->model
+class FlaxMistralPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = MistralConfig
+    base_model_prefix = "model"
+    module_class: nn.Module = None  # `__init__` calls this to build the module, so it must be set before instantiation
+
+    def __init__(
+        self,
+        config: MistralConfig,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):  # extra keyword arguments are forwarded to `module_class`, not to the parent constructor
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors: zero token ids, an all-ones mask and positions 0..seq_len-1
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
+
+        if params is not None:  # keep the supplied params and fill only the keys listed in `_missing_keys`
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+        # run `init` with `init_cache=True` and keep only the resulting "cache" collection
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: Optional[dict] = None,
+        past_key_values: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        batch_size, sequence_length = input_ids.shape
+
+        if position_ids is None:  # default positions are 0..sequence_length-1, which is only valid without a cache
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # If past_key_values are passed, the cache is already initialized, so it is supplied as the "cache" collection and marked mutable so that FlaxMistralAttention can update it. NOTE(review): this test is truthiness-based, while the unpacking below tests `is not None`; an empty dict takes different paths.
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,
+            False,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output (with a mutable collection, `apply` returns `(outputs, mutated_variables)`)
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        return outputs
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaLayerCollection with Llama->Mistral
+class FlaxMistralLayerCollection(nn.Module):  # runs the decoder layers one after another
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [
+            FlaxMistralDecoderLayer(self.config, dtype=self.dtype, name=str(i))  # each layer is named by its index
+            for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,  # accepted but not read anywhere in this method
+    ):
+        all_attentions = () if output_attentions else None  # stays None when attentions are not requested
+        all_hidden_states = () if output_hidden_states else None  # stays None when hidden states are not requested
+
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # records the input to each block, not its output
+            layer_outputs = block(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]  # first element feeds the next block
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)  # second element is collected as this layer's attentions
+
+        # this contains possible `None` values - `FlaxMistralModule` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+
+        return outputs  # always a 3-tuple: (last hidden state, all_hidden_states, all_attentions)
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaModule with Llama->Mistral
+class FlaxMistralModule(nn.Module):  # token embedding -> layer collection -> final norm
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.hidden_size = self.config.hidden_size
+        embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
+        self.embed_tokens = nn.Embed(
+            self.config.vocab_size,
+            self.hidden_size,
+            embedding_init=embedding_init,
+            dtype=self.dtype,
+        )
+        self.layers = FlaxMistralLayerCollection(self.config, dtype=self.dtype)
+        self.norm = FlaxMistralRMSNorm(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.embed_tokens(input_ids.astype("i4"))  # ids are cast to int32 before the lookup
+
+        outputs = self.layers(
+            input_embeds,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.norm(hidden_states)  # final norm applied after the last layer
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)  # append the normed final state to the per-layer inputs
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]  # only the first element is replaced by the normed state
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)  # drops the entries that were not requested
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+
+
+@add_start_docstrings(
+    "The bare Mistral Model transformer outputting raw hidden-states without any specific head on top.",
+    MISTRAL_START_DOCSTRING,
+)
+class FlaxMistralModel(FlaxMistralPreTrainedModel):
+    module_class = FlaxMistralModule  # the only thing this subclass sets; everything else is inherited
+
+
+append_call_sample_docstring(
+    FlaxMistralModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutputWithPast,  # NOTE(review): FlaxMistralModule returns FlaxBaseModelOutput - confirm which is intended
+    _CONFIG_FOR_DOC,
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
+)
+
+
+# Copied from transformers.models.llama.modeling_flax_llama.FlaxLlamaForCausalLMModule with Llama->Mistral
+class FlaxMistralForCausalLMModule(nn.Module):  # FlaxMistralModule followed by a bias-free vocabulary projection
+    config: MistralConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.model = FlaxMistralModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,  # one output feature per vocabulary entry
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.model(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]  # index 0 is used on both the tuple and the output-object path
+        lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]  # logits take the place of the hidden state at index 0
+
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    The Mistral Model transformer with a language modeling head (linear layer) on top.
+    """,
+    MISTRAL_START_DOCSTRING,
+)
+
+# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Mistral
+class FlaxMistralForCausalLM(FlaxMistralPreTrainedModel):
+    module_class = FlaxMistralForCausalLMModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)  # cache is sized for `max_length` positions
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since Mistral uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1  # running count of mask entries along the row, minus one
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))  # write the given mask at offset (0, 0)
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values  # carry the updated cache into the next step
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1  # keep only the last position, advanced by one
+        return model_kwargs  # the same dict object, mutated in place
+
+
+append_call_sample_docstring(
+    FlaxMistralForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutputWithCrossAttentions,  # NOTE(review): FlaxMistralForCausalLMModule returns FlaxCausalLMOutput - confirm
+    _CONFIG_FOR_DOC,
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
+)
+
+__all__ = ["FlaxMistralForCausalLM", "FlaxMistralModel", "FlaxMistralPreTrainedModel"]  # names exported by `from ... import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b7c7b2c17900641f6b7d540fbec1ee2faf59c33
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_mistral.py
@@ -0,0 +1,480 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/mistral/modular_mistral.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_mistral.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from transformers.utils.generic import check_model_inputs
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import (
+    GenericForQuestionAnswering,
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_mistral import MistralConfig
+
+
+class MistralMLP(nn.Module):  # gated feed-forward block built from three bias-free linear layers
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]  # activation is looked up by the name stored in the config
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))  # act(gate(x)) * up(x), projected back to hidden_size
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]  # first half of the last dimension
+    x2 = x[..., x.shape[-1] // 2 :]  # second half (one element longer when the size is odd)
+    return torch.cat((-x2, x1), dim=-1)  # result is [-x2, x1] along the last dimension
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)  # add a size-1 axis so cos broadcasts against q and k
+    sin = sin.unsqueeze(unsqueeze_dim)  # same for sin
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)  # the same cos/sin are applied to the keys
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape  # unpacking requires a 4-D input
+    if n_rep == 1:
+        return hidden_states  # nothing to repeat: the input tensor itself is returned
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)  # new axis after the head axis, expanded to n_rep
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)  # fold the repeat axis into the head axis
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],  # accepted but not read in this function
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)  # repeat each key/value head num_key_value_groups times
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling  # scaled query-key scores
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]  # crop the mask's last axis to the key length
+        attn_weights = attn_weights + causal_mask  # the mask is added to the scores, not multiplied
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)  # softmax in float32, then cast back
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap axes 1 and 2 before returning
+
+    return attn_output, attn_weights
+
+
+class MistralAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: MistralConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx  # passed to the cache's update() in forward
+        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads  # derived when the config has no truthy head_dim
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]  # every dimension except the last
+        hidden_shape = (*input_shape, -1, self.head_dim)  # -1 lets `view` infer the number of heads
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)  # split into heads, then swap axes 1 and 2
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)  # values are not rotated
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)  # attention then uses what update() returns
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]  # backend looked up by configured name
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,  # dropout only while training
+            scaling=self.scaling,
+            sliding_window=getattr(self.config, "sliding_window", None),  # main diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # merge the trailing head axes back into one
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class MistralRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MistralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))  # learnable scale, initialised to ones
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype  # remembered so the result can be cast back
+        hidden_states = hidden_states.to(torch.float32)  # the statistics are computed in float32
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # mean of squares over the last dimension
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)  # cast back first, then apply the scale
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class MistralDecoderLayer(GradientCheckpointingLayer):  # norm -> attention -> residual, then norm -> MLP -> residual
+    def __init__(self, config: MistralConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = MistralAttention(config=config, layer_idx=layer_idx)
+        self.mlp = MistralMLP(config)
+        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states  # kept for the skip connection around attention
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(  # the returned attention weights are discarded here
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,  # not a named parameter of MistralAttention.forward; lands in its **kwargs
+            past_key_values=past_key_values,
+            use_cache=use_cache,  # likewise lands in MistralAttention.forward's **kwargs
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states  # kept for the skip connection around the MLP
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class MistralPreTrainedModel(PreTrainedModel):  # holds class-level settings only; defines no methods of its own
+    config: MistralConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["MistralDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {  # maps an output field name to the module class associated with it
+        "hidden_states": MistralDecoderLayer,
+        "attentions": MistralAttention,
+    }
+
+
+class MistralRotaryEmbedding(nn.Module):  # produces the (cos, sin) pair consumed by apply_rotary_pos_emb
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: MistralConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"  # used when the config carries no rope_scaling dict
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]  # init function selected by rope type
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)  # non-persistent buffer
+        self.original_inv_freq = self.inv_freq  # second reference to the initial frequencies
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)  # one copy per row of position_ids
+        position_ids_expanded = position_ids[:, None, :].float()  # positions as float, with a new middle axis
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"  # "mps" and non-string types map to "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)  # frequency x position products
+            emb = torch.cat((freqs, freqs), dim=-1)  # frequencies duplicated along the last dimension
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)  # results are cast to the dtype of `x`
+
+
+@auto_docstring
+class MistralModel(MistralPreTrainedModel):
+    def __init__(self, config: MistralConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = MistralRotaryEmbedding(config=config)  # one rotary module shared by all layers
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither input is given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)  # create a fresh cache when caching is on but none was given
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(  # positions continue from the number of tokens already cached
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # default: cache positions with a leading size-1 axis
+
+        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
+        causal_mask = mask_function(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)  # computed once and passed to every layer
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:  # at most num_hidden_layers layers are run
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)  # final norm after the last layer
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,  # the cache is returned only when use_cache is truthy
+        )
+
+
+@auto_docstring
+class MistralForCausalLM(MistralPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = MistralModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # bias-free projection to vocabulary size
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, MistralForCausalLM
+
+        >>> model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
+        >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep  # int 0 gives slice(0, None): keep all positions
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:  # the loss is computed only when labels are supplied
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class MistralForTokenClassification(GenericForTokenClassification, MistralPreTrainedModel):  # empty body: behavior comes from the two bases
+    pass
+
+
+class MistralForSequenceClassification(GenericForSequenceClassification, MistralPreTrainedModel):  # empty body: behavior comes from the two bases
+    pass
+
+
+class MistralForQuestionAnswering(GenericForQuestionAnswering, MistralPreTrainedModel): ...  # empty body: behavior comes from the two bases
+
+
+__all__ = [  # names exported by `from ... import *`
+    "MistralForCausalLM",
+    "MistralForQuestionAnswering",
+    "MistralModel",
+    "MistralPreTrainedModel",
+    "MistralForSequenceClassification",
+    "MistralForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_tf_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_tf_mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3ca7d13b6a88f27fe1957510da75b84865ae2dd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modeling_tf_mistral.py
@@ -0,0 +1,1016 @@
+# coding=utf-8
+# Copyright 2024 Mistral AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0  Mistral model."""
+
+import math
+import warnings
+from typing import Optional, Union
+
+import tensorflow as tf
+
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPast,
+    TFCausalLMOutputWithPast,
+    TFSequenceClassifierOutputWithPast,
+)
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    get_tf_activation,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_mistral import MistralConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MistralConfig"
+
+
+def _make_causal_mask(input_ids_shape, dtype, past_key_values_length=0):
+    """
+    Build the additive causal mask (0 = may attend, `dtype.min` = blocked) for static or dynamic batch sizes.
+    """
+    bsz, tgt_len = input_ids_shape
+
+    # Start fully blocked, then zero the lower triangle and diagonal so a query only sees itself and the past
+    mask = tf.fill((tgt_len, tgt_len), tf.dtypes.as_dtype(dtype).min)
+    mask_cond = tf.range(tgt_len)
+    mask = tf.where(mask_cond[:, None] >= mask_cond[None, :], 0.0, mask)
+
+    if past_key_values_length > 0:
+        mask = tf.concat([tf.zeros((tgt_len, past_key_values_length), dtype=dtype), mask], axis=-1)
+
+    if bsz is None:
+        # The batch size is unknown here, so it cannot be used as a `tf.tile` multiple (tiling by `None`
+        # raises). Keep a leading batch axis of 1 instead: within this file the mask is only ever added to
+        # other tensors, so it broadcasts over whatever the real batch size turns out to be.
+        # Returned shape: (1, 1, tgt_len, tgt_len + past_key_values_length)
+        mask = mask[None, None, :, :]
+    else:
+        # When batch size is static, directly use broadcast_to
+        mask = tf.broadcast_to(mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length))
+
+    return mask
+
+
+def _expand_mask(mask, dtype, tgt_len=None):
+    """
+    Expands a `[bsz, src_len]` 1/0 attention_mask to an additive `[bsz, 1, tgt_len, src_len]` mask in `dtype`.
+    """
+    bsz, src_len = shape_list(mask)
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = tf.broadcast_to(tf.expand_dims(tf.expand_dims(mask, 1), 1), [bsz, 1, tgt_len, src_len])
+    # 1.0 where the original mask was 0 (position must be hidden), 0.0 where it was 1 (position is visible)
+    inverted_mask = 1.0 - tf.cast(expanded_mask, dtype)
+
+    # Fill hidden positions with the most negative finite value of `dtype` itself. A hard-coded
+    # `tf.float32.min` would give a float32 fill that cannot be combined with a non-float32 `inverted_mask`.
+    min_value = tf.cast(tf.dtypes.as_dtype(dtype).min, inverted_mask.dtype)
+    return tf.where(tf.cast(inverted_mask, bool), min_value, inverted_mask)
+
+
+class TFMistralRMSNorm(keras.layers.Layer):
+    def __init__(self, hidden_size, eps=1e-6, **kwargs):
+        """
+        TFMistralRMSNorm is equivalent to T5LayerNorm: scale-only RMS normalization over the last axis.
+        """
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.variance_epsilon = eps
+
+    def build(self, input_shape=None):
+        # Check the guard before creating anything: with the guard placed after `add_weight`, a repeated
+        # `build()` call would create a second "weight" variable and rebind `self.weight` to it.
+        if self.built:
+            return
+        self.built = True
+        self.weight = self.add_weight(name="weight", shape=self.hidden_size, initializer="ones")
+
+    def call(self, hidden_states):
+        # Normalize in float32, then return to the caller's dtype before applying the learned scale.
+        input_dtype = hidden_states.dtype
+        hidden_states = tf.cast(hidden_states, tf.float32)
+        # Mean of squares over the last axis; epsilon keeps the square root away from zero.
+        variance = tf.reduce_mean(tf.square(hidden_states), axis=-1, keepdims=True)
+        hidden_states = tf.divide(hidden_states, tf.sqrt(variance + self.variance_epsilon))
+        return self.weight * tf.cast(hidden_states, input_dtype)
+
+
+# Verification: https://colab.research.google.com/gist/ariG23498/f8d8131b795a131b93d99e70ee93c192/scratchpad.ipynb
+class TFMistralRotaryEmbedding(keras.layers.Layer):
+    """Produces the cosine/sine tables of rotary position embeddings for positions `0 .. seq_len - 1`."""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, **kwargs):
+        super().__init__(**kwargs)
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        # One inverse frequency per pair of channels: base ** (-2i / dim) for i in [0, dim / 2)
+        self.inv_freq = 1.0 / (self.base ** (tf.range(start=0, limit=self.dim, delta=2, dtype=tf.float32) / self.dim))
+
+    def call(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]; only its dtype is used here
+        t = tf.cast(tf.range(seq_len, dtype=tf.int64), self.inv_freq.dtype)
+        # Outer product -> angle per (position, frequency), duplicated to span the full head dimension
+        freqs = tf.einsum("i,j->ij", t, self.inv_freq)
+        emb = tf.concat([freqs, freqs], axis=-1)
+        # `emb` already has exactly `seq_len` rows and the casts below already target `x.dtype`,
+        # so no further slicing or casting is needed.
+        cos_values = tf.cast(tf.cos(emb), x.dtype)
+        return (cos_values, tf.cast(tf.sin(emb), x.dtype))
+
+
+def rotate_half(x):
+    """Swap the two halves of the last axis of `x`, negating the half that moves to the front."""
+    half = shape_list(x)[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    # (front, back) -> (-back, front)
+    return tf.concat([-back, front], axis=-1)
+
+
+# Verification: https://colab.research.google.com/gist/ariG23498/bb8474baeb33f4ae6ed7d77da5f7e7a4/scratchpad.ipynb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Rotate the query and key tensors by their position-dependent rotary angles.
+
+    Args:
+        q (`tf.Tensor`): Query tensor to rotate.
+        k (`tf.Tensor`): Key tensor to rotate.
+        cos (`tf.Tensor`): Cosine table; rows are looked up with `position_ids`.
+        sin (`tf.Tensor`): Sine table; rows are looked up with `position_ids`.
+        position_ids (`tf.Tensor`):
+            Position index of every token in `q` and `k`. With a KV-cache these are typically offset by the
+            number of cached tokens.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Axis inserted into the gathered `cos`/`sin` rows so they broadcast against `q` and `k`. Use 1 when
+            `q`/`k` are laid out as [batch_size, heads, seq_len, head_dim] and 2 when they are laid out as
+            [batch_size, seq_len, heads, head_dim].
+
+    Returns:
+        `tuple(tf.Tensor)`: the rotated query and the rotated key, in that order.
+    """
+    cos_at_pos = tf.expand_dims(tf.gather(cos, position_ids), unsqueeze_dim)
+    sin_at_pos = tf.expand_dims(tf.gather(sin, position_ids), unsqueeze_dim)
+
+    def _rotate(states):
+        return (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)
+
+    return _rotate(q), _rotate(k)
+
+
+class TFMistralMLP(keras.layers.Layer):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = keras.layers.Dense(self.intermediate_size, use_bias=False, name="gate_proj")
+        self.up_proj = keras.layers.Dense(self.intermediate_size, use_bias=False, name="up_proj")
+        self.down_proj = keras.layers.Dense(self.hidden_size, use_bias=False, name="down_proj")
+        self.act_fn = get_tf_activation(config.hidden_act)
+
+    def call(self, x):
+        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
+        return self.down_proj(gated)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        # (attribute name, input feature size) for each projection, built under its own name scope
+        specs = (("gate_proj", self.hidden_size), ("up_proj", self.hidden_size), ("down_proj", self.intermediate_size))
+        for attr, in_features in specs:
+            if getattr(self, attr, None) is not None:
+                with tf.name_scope(getattr(self, attr).name):
+                    getattr(self, attr).build((in_features,))
+
+
+# Verification: https://colab.research.google.com/gist/ariG23498/556d443d491966763ce2e7eee336efed/scratchpad.ipynb
+def repeat_kv(hidden_states: tf.Tensor, n_rep: int) -> tf.Tensor:
+    """
+    Duplicate every key/value head `n_rep` times in a row, like torch.repeat_interleave(x, dim=1, repeats=n_rep):
+    (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    """
+    batch, num_key_value_heads, slen, head_dim = shape_list(hidden_states)
+    if n_rep == 1:
+        return hidden_states
+    # Insert a copy axis right after the head axis, fill it with n_rep copies, then fold it into the head axis
+    tiled = tf.repeat(tf.expand_dims(hidden_states, 2), repeats=n_rep, axis=2)
+    return tf.reshape(tiled, (batch, num_key_value_heads * n_rep, slen, head_dim))
+
+
+class TFMistralAttention(keras.layers.Layer):
+    """
+    Multi-headed self-attention with rotary position embeddings and grouped key/value heads. No sliding-window
+    restriction is applied inside this layer: what may be attended to is decided solely by `attention_mask`.
+    """
+
+    def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = keras.layers.Dense(self.num_heads * self.head_dim, use_bias=False, name="q_proj")
+        self.k_proj = keras.layers.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, name="k_proj")
+        self.v_proj = keras.layers.Dense(self.num_key_value_heads * self.head_dim, use_bias=False, name="v_proj")
+        self.o_proj = keras.layers.Dense(self.hidden_size, use_bias=False, name="o_proj")
+
+        self.rotary_emb = TFMistralRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+            name="rotary_emb",
+        )
+        self.dropout = keras.layers.Dropout(rate=self.attention_dropout)
+
+    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
+        tensor = tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim))
+        tensor = tf.transpose(tensor, perm=(0, 2, 1, 3))
+        return tensor
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: Optional[tf.Tensor] = None,
+        position_ids: Optional[tf.Tensor] = None,
+        past_key_value: Optional[tuple[tf.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        training=None,
+        **kwargs,
+    ) -> tuple[tf.Tensor, Optional[tf.Tensor], Optional[tuple[tf.Tensor]]]:
+        """
+        Attend from the tokens in `hidden_states` to the cached tokens (if any) plus those same new tokens.
+
+        `attention_mask`, when given, is added to the raw attention scores, so it has to be additive and
+        broadcastable to `(bsz, num_heads, q_len, kv_len)`. `past_key_value` is a `(key, value)` pair laid out
+        as `(bsz, num_key_value_heads, past_len, head_dim)` whose keys already carry their rotary rotation; it
+        is prepended to the new keys/values along the sequence axis.
+
+        Returns `(attn_output, attn_weights or None, updated (key, value) cache or None)`.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        bsz, q_len, _ = shape_list(hidden_states)
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = self._shape(query_states, q_len, bsz)
+        key_states = tf.transpose(
+            tf.reshape(key_states, (bsz, q_len, self.num_key_value_heads, self.head_dim)), perm=(0, 2, 1, 3)
+        )
+        value_states = tf.transpose(
+            tf.reshape(value_states, (bsz, q_len, self.num_key_value_heads, self.head_dim)), perm=(0, 2, 1, 3)
+        )
+
+        kv_seq_len = shape_list(key_states)[-2]
+        if past_key_value is not None:
+            # Use `shape_list` rather than `.shape`: the static cache length can be `None` inside a traced
+            # graph, and adding `None` to `kv_seq_len` would raise.
+            kv_seq_len += shape_list(past_key_value[0])[-2]
+        cos, sin = self.rotary_emb(x=value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(
+            q=query_states, k=key_states, cos=cos, sin=sin, position_ids=position_ids
+        )
+
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = tf.concat([past_key_value[0], key_states], axis=2)
+            value_states = tf.concat([past_key_value[1], value_states], axis=2)
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = tf.matmul(query_states, key_states, transpose_b=True) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+
+        # normalize the scores over the key axis, then make sure the weights are in the query dtype
+        attn_weights = stable_softmax(attn_weights, axis=-1)
+        attn_weights = tf.cast(attn_weights, query_states.dtype)
+        attn_weights = self.dropout(attn_weights, training=training)
+        attn_output = tf.matmul(attn_weights, value_states)
+
+        attn_output = tf.transpose(attn_output, perm=(0, 2, 1, 3))
+        attn_output = tf.reshape(attn_output, (bsz, q_len, self.hidden_size))
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build((self.hidden_size,))
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build((self.hidden_size,))
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build((self.hidden_size,))
+        if getattr(self, "o_proj", None) is not None:
+            with tf.name_scope(self.o_proj.name):
+                self.o_proj.build((self.num_heads * self.head_dim,))
+
+
+class TFMistralDecoderLayer(keras.layers.Layer):
+    def __init__(self, config: MistralConfig, layer_idx: int, **kwargs):
+        super().__init__(**kwargs)
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = TFMistralAttention(config, layer_idx, name="self_attn")
+
+        self.mlp = TFMistralMLP(config, name="mlp")
+        self.input_layernorm = TFMistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="input_layernorm")
+        self.post_attention_layernorm = TFMistralRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, name="post_attention_layernorm"
+        )
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: Optional[tf.Tensor] = None,
+        position_ids: Optional[tf.Tensor] = None,
+        past_key_value: Optional[tuple[tf.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[tf.Tensor, Optional[tuple[tf.Tensor, tf.Tensor]]]:
+        """
+        Pre-norm block: `x + self_attn(input_layernorm(x))`, then `h + mlp(post_attention_layernorm(h))`.
+
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`, *optional*): forwarded unchanged to the attention layer, which adds it
+                to the attention scores. NOTE(review): `TFMistralMainLayer` passes the already expanded 4-D
+                additive mask here, not a `(batch, sequence_length)` 0/1 mask.
+            position_ids (`tf.Tensor`, *optional*): forwarded to the attention layer for the rotary embedding.
+            past_key_value (`Tuple(tf.Tensor)`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*): if truthy, the attention weights are appended to the output.
+            use_cache (`bool`, *optional*): if truthy, the updated key/value cache is appended to the output.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention (normalized input in, residual added back afterwards)
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected (same pre-norm + residual pattern around the MLP)
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        # Output tuple: hidden_states, then attention weights and/or the key/value cache when requested
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attn", None) is not None:
+            with tf.name_scope(self.self_attn.name):
+                self.self_attn.build(None)
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "input_layernorm", None) is not None:
+            with tf.name_scope(self.input_layernorm.name):
+                self.input_layernorm.build(None)
+        if getattr(self, "post_attention_layernorm", None) is not None:
+            with tf.name_scope(self.post_attention_layernorm.name):
+                self.post_attention_layernorm.build(None)
+
+
+@keras_serializable
+class TFMistralMainLayer(keras.layers.Layer):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`TFMistralDecoderLayer`]
+
+    Args:
+        config: MistralConfig
+    """
+
+    config_class = MistralConfig
+
+    def __init__(self, config: MistralConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+
+        # TF and PT Embedding check: https://colab.research.google.com/gist/ariG23498/2b9826818875c9c4968c79cb19f55f2c/scratchpad.ipynb
+        self.embed_tokens = keras.layers.Embedding(
+            input_dim=config.vocab_size,
+            output_dim=config.hidden_size,
+            name="embed_tokens",
+        )
+        self.layers = [
+            TFMistralDecoderLayer(config, layer_idx, name=f"layers.{layer_idx}")
+            for layer_idx in range(config.num_hidden_layers)
+        ]
+        self._attn_implementation = config._attn_implementation
+        self.norm = TFMistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps, name="norm")
+        self.config = config
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        """
+        Combine the causal mask with the optional padding mask into one additive mask.
+
+        [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        """
+        # The causal part is always built (also for a single query token), so it is never `None` below
+        combined_attention_mask = _make_causal_mask(
+            input_shape,
+            inputs_embeds.dtype,
+            past_key_values_length=past_key_values_length,
+        )
+
+        if attention_mask is not None:
+            # Both masks are additive, so their sum blocks a position as soon as either one blocks it
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+            combined_attention_mask = expanded_attn_mask + combined_attention_mask
+
+        return combined_attention_mask
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: Optional[tf.Tensor] = None,
+        attention_mask: Optional[tf.Tensor] = None,
+        position_ids: Optional[tf.Tensor] = None,
+        past_key_values: Optional[list[tf.Tensor]] = None,
+        inputs_embeds: Optional[tf.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TFBaseModelOutputWithPast]:
+        """
+        Embed the inputs (unless `inputs_embeds` is given), run every decoder layer, then apply the final norm.
+        """
+        # retrieve input_ids and inputs_embeds (exactly one of the two must be provided)
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = shape_list(inputs_embeds)
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        if past_key_values is not None:
+            past_key_values_length = shape_list(past_key_values[0][0])[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if position_ids is None:
+            position_ids = tf.range(start=past_key_values_length, limit=seq_length_with_past, dtype=tf.int64)
+            position_ids = tf.reshape(tf.expand_dims(position_ids, 0), (-1, seq_length))
+        else:
+            position_ids = tf.cast(tf.reshape(position_ids, (-1, seq_length)), tf.int64)
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if attention_mask is None:
+            attention_mask = tf.ones((batch_size, seq_length_with_past), dtype=tf.bool)
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return TFBaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embed_tokens", None) is not None:
+            with tf.name_scope(self.embed_tokens.name):
+                self.embed_tokens.build(None)
+        if getattr(self, "norm", None) is not None:
+            with tf.name_scope(self.norm.name):
+                self.norm.build(None)
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+MISTRAL_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, position_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "attention_mask": attention_mask})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`MistralConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
+    MISTRAL_START_DOCSTRING,
+)
+class TFMistralPreTrainedModel(TFPreTrainedModel):
+    config_class = MistralConfig  # configuration type associated with the TF Mistral model classes
+    base_model_prefix = "model"  # NOTE(review): presumably the attribute name of the wrapped main layer - confirm
+
+
+MISTRAL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read
+            `TFMistralMainLayer._prepare_decoder_attention_mask` and modify it to your needs. See diagram 1 in
+            [the paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy.
+
+            If not provided, a mask of ones covering the cached and the new positions is used, i.e. nothing is
+            masked out besides the causal constraint.
+        position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(tf.Tensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Only one format is allowed:
+            - Tuple of `tuple(tf.Tensor)` of length `config.num_hidden_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_key_value_heads, sequence_length, embed_size_per_head)`. This is also known as
+            the legacy cache format.
+
+            When `use_cache=True`, the model returns its cache in this same format, whether or not
+            `past_key_values` were passed in.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Mistral Model outputting raw hidden-states without any specific head on top.",
+    MISTRAL_START_DOCSTRING,
+)
+class TFMistralModel(TFMistralPreTrainedModel):  # thin wrapper: `call` and `build` only delegate to `self.model`
+    def __init__(self, config: MistralConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.model = TFMistralMainLayer(config, name="model")  # the only sub-layer created by this class
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
+    def call(
+        self,
+        input_ids: Optional[tf.Tensor] = None,
+        attention_mask: Optional[tf.Tensor] = None,
+        position_ids: Optional[tf.Tensor] = None,
+        past_key_values: Optional[list[tf.Tensor]] = None,
+        inputs_embeds: Optional[tf.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TFBaseModelOutputWithPast]:
+        outputs = self.model(  # every argument is forwarded unchanged, by keyword
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        return outputs  # returned exactly as produced by the main layer
+
+    def build(self, input_shape=None):  # `input_shape` is accepted but never read
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True
+        if getattr(self, "model", None) is not None:
+            with tf.name_scope(self.model.name):  # build the main layer inside a scope named after it
+                self.model.build(None)
+
+
+class TFMistralForCausalLM(TFMistralPreTrainedModel, TFCausalLanguageModelingLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.model = TFMistralMainLayer(config, name="model")
+        self.vocab_size = config.vocab_size
+        self.lm_head = keras.layers.Dense(  # bias-free projection with `config.vocab_size` output units
+            config.vocab_size,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="lm_head",
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def call(
+        self,
+        input_ids: Optional[tf.Tensor] = None,
+        attention_mask: Optional[tf.Tensor] = None,
+        position_ids: Optional[tf.Tensor] = None,
+        past_key_values: Optional[list[tf.Tensor]] = None,
+        inputs_embeds: Optional[tf.Tensor] = None,
+        labels: Optional[tf.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TFCausalLMOutputWithPast]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the causal language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
+            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+
+        # run the shared main layer; element 0 of its output is used as the hidden states below
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = tf.cast(logits, tf.float32)  # both the loss and the returned logits use float32
+
+        loss = None
+        if labels is not None:
+            # next-token objective: drop the last logit position and the first label position so they line up
+            shifted_logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.hf_compute_loss(labels, shifted_logits)
+
+        if not return_dict:  # tuple form: (loss if computed, logits, *remaining main-layer outputs)
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        # Omit tokens covered by past_key_values
+        if past_key_values:
+            input_ids = tf.expand_dims(input_ids[:, -1], -1)  # keep only the last token, as a length-1 axis
+
+        position_ids = kwargs.get("position_ids")
+        if attention_mask is not None and position_ids is None:
+            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)  # sum of mask values before each index
+            if past_key_values:
+                position_ids = tf.expand_dims(position_ids[:, -1], -1)  # match the single remaining input token
+
+        return {  # NOTE(review): `inputs_embeds` is accepted above but is not part of the returned dict
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.get("use_cache"),
+        }
+
+    def build(self, input_shape=None):  # `input_shape` is accepted but never read
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True
+        if getattr(self, "model", None) is not None:
+            with tf.name_scope(self.model.name):
+                self.model.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build((self.config.hidden_size,))  # build shape carries only the input feature size
+
+
+@add_start_docstrings(
+    """
+    The Mistral Model transformer with a sequence classification head on top (linear layer).
+
+    [`MistralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    MISTRAL_START_DOCSTRING,
+)
+class TFMistralForSequenceClassification(TFMistralPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.model = TFMistralMainLayer(config, name="model")
+        self.score = keras.layers.Dense(  # bias-free classification head with `num_labels` output units
+            self.num_labels,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="score",
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def call(
+        self,
+        input_ids: Optional[tf.Tensor] = None,
+        attention_mask: Optional[tf.Tensor] = None,
+        position_ids: Optional[tf.Tensor] = None,
+        past_key_values: Optional[list[tf.Tensor]] = None,
+        inputs_embeds: Optional[tf.Tensor] = None,
+        labels: Optional[tf.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TFSequenceClassifierOutputWithPast]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. The loss is computed on the logits pooled at the last non-padding token of
+            each sequence, so one label per sequence is expected.
+        """
+
+        transformer_outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)  # per-position logits; pooled to one vector per row further down
+        logits_shape = shape_list(logits)
+        batch_size = logits_shape[0]
+
+        if self.config.pad_token_id is None:
+            last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)  # no pad id: use the last position
+        else:
+            if input_ids is not None:
+                token_indices = tf.range(shape_list(input_ids)[-1])
+                non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype)
+                last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1)  # largest non-pad index
+            else:
+                last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)  # embeddings only: last position
+                logger.warning_once(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+        loss = None
+
+        pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1)  # one logits vector per row
+
+        if labels is not None:
+            if self.config.pad_token_id is None and logits_shape[0] != 1:
+                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+
+            loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels]))
+
+        if not return_dict:  # tuple form: (loss if computed, pooled_logits, *remaining main-layer outputs)
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):  # `input_shape` is accepted but never read
+        if self.built:  # already built: nothing to do
+            return
+        self.built = True
+        if getattr(self, "model", None) is not None:
+            with tf.name_scope(self.model.name):
+                self.model.build(None)
+        if getattr(self, "score", None) is not None:
+            with tf.name_scope(self.score.name):
+                self.score.build((self.config.hidden_size,))  # build shape carries only the input feature size
+
+
+__all__ = ["TFMistralModel", "TFMistralForCausalLM", "TFMistralForSequenceClassification", "TFMistralPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..290d60b91e66ccb1bb9aaf853b521cc56f20e79a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.py
@@ -0,0 +1,199 @@
+from typing import Callable, Optional
+
+import torch
+from torch import nn
+
+from transformers.utils.generic import check_model_inputs
+
+from ...cache_utils import Cache, DynamicCache
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import (
+    GenericForQuestionAnswering,
+)
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaForSequenceClassification,
+    LlamaForTokenClassification,
+    LlamaMLP,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from .configuration_mistral import MistralConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class MistralMLP(LlamaMLP):  # LlamaMLP whose three projections are re-created without bias
+    def __init__(self, config):
+        super().__init__(config)  # parent sets self.hidden_size / self.intermediate_size used below
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)  # maps back to hidden_size
+
+
+class MistralAttention(LlamaAttention):  # bias-free projections; passes `sliding_window` to the attention backend
+    def __init__(self, config: MistralConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads  # missing/falsy head_dim -> derived
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]  # every dimension except the feature dimension
+        hidden_shape = (*input_shape, -1, self.head_dim)  # -1 lets `view` infer the number of heads
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)  # swap dims 1 and 2 after the head split
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings  # the tuple is unpacked as (cos, sin)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward  # default; replaced below for non-eager implementations
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,  # dropout is applied only in training mode
+            scaling=self.scaling,
+            sliding_window=getattr(self.config, "sliding_window", None),  # main diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()  # flatten trailing dims back onto input_shape
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class MistralDecoderLayer(LlamaDecoderLayer):  # Llama decoder layer using the Mistral attention and MLP modules
+    def __init__(self, config: MistralConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.self_attn = MistralAttention(config=config, layer_idx=layer_idx)  # assigned after super().__init__, so it takes precedence
+        self.mlp = MistralMLP(config)  # likewise assigned after the parent constructor has run
+
+
+class MistralPreTrainedModel(LlamaPreTrainedModel):  # only overrides the class attribute below
+    _can_record_outputs = {  # output name -> module class; the consumer of this mapping is not visible here
+        "hidden_states": MistralDecoderLayer,
+        "attentions": MistralAttention,
+    }
+
+
+class MistralModel(LlamaModel):  # overrides `forward` to choose between the plain and sliding-window causal mask
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are given
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:  # caching requested without a cache: create a fresh one
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:  # default: positions continue after what the cache already holds
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # add a leading dimension
+
+        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
+        causal_mask = mask_function(  # same keyword arguments whichever mask function was selected
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)  # computed once, passed to every layer
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:  # at most num_hidden_layers layers run
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)  # final normalisation after the last decoder layer
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,  # the cache is exposed only when use_cache is truthy
+        )
+
+
+class MistralForCausalLM(LlamaForCausalLM):  # no overrides: behaviour comes entirely from the Llama base class
+    pass
+
+
+class MistralForTokenClassification(LlamaForTokenClassification):  # no overrides
+    pass
+
+
+class MistralForSequenceClassification(LlamaForSequenceClassification):  # no overrides
+    pass
+
+
+class MistralForQuestionAnswering(GenericForQuestionAnswering, MistralPreTrainedModel): ...  # empty body; bases only
+
+
+__all__ = [
+    "MistralForCausalLM",
+    "MistralForQuestionAnswering",
+    "MistralModel",
+    "MistralPreTrainedModel",
+    "MistralForSequenceClassification",
+    "MistralForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91c0a48b8b8c906c4e1c9cbde3afe4130d233197
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/configuration_mixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/configuration_mixtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c083c1097e44e4a0608023786da340e0d2bd3f29
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/configuration_mixtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modeling_mixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modeling_mixtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5e57a37895b1d29ca6a9cb19938d3fd13cbf01d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modeling_mixtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modular_mixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modular_mixtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca95a9fe7bf3535f0530a5707c1cf81d448af8bd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mixtral/__pycache__/modular_mixtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1efc6527fcbd523aaca30c08c6ee445adc69cb13
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/configuration_mm_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/configuration_mm_grounding_dino.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b70d697edabea8737834521fed9920f4854c2b8e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/configuration_mm_grounding_dino.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/modular_mm_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/modular_mm_grounding_dino.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bd826b44a117b608db36dac46d1cefdc53eae44
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/__pycache__/modular_mm_grounding_dino.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3d498323de4dc3b60dcf8c3c8ac02f25d8c3ed1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py
@@ -0,0 +1,2605 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_mm_grounding_dino.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...file_utils import ModelOutput, is_timm_available, requires_backends
+from ...integrations import use_kernel_forward_from_hub
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import meshgrid
+from ...utils import auto_docstring
+from ...utils.backbone_utils import load_backbone
+from ..auto.modeling_auto import AutoModel
+from .configuration_mm_grounding_dino import MMGroundingDinoConfig
+
+
+if is_timm_available():
+    from timm import create_model
+
+
+class MMGroundingDinoContrastiveEmbedding(nn.Module):  # scaled, biased vision-text similarity logits, padded to max_text_len
+    def __init__(self, config):
+        super().__init__()
+        self.max_text_len = config.max_text_len  # size of the last dimension of the logits returned by forward
+        self.bias = nn.Parameter(torch.tensor(0.0))  # learnable scalar added to every logit
+
+    def forward(
+        self,
+        vision_hidden_state: torch.FloatTensor,
+        text_hidden_state: torch.FloatTensor,
+        text_token_mask: torch.BoolTensor,  # True where a text token is kept; False positions are set to -inf
+    ) -> torch.FloatTensor:
+        res = vision_hidden_state @ text_hidden_state.transpose(-1, -2)  # dot product of every vision/text vector pair
+        res = res / math.sqrt(vision_hidden_state.shape[-1])  # scale by sqrt of the feature dimension
+        res = res + self.bias
+        res.masked_fill_(~text_token_mask[:, None, :], float("-inf"))
+
+        # pad the last dimension to max_text_len with -inf; dtype follows `res` so half/double logits are not recast
+        new_res = torch.full((*res.shape[:-1], self.max_text_len), float("-inf"), dtype=res.dtype, device=res.device)
+        new_res[..., : res.shape[-1]] = res
+
+        return new_res
+
+
+@use_kernel_forward_from_hub("MultiScaleDeformableAttention")
+class MultiScaleDeformableAttention(nn.Module):  # multi-scale deformable attention computed with grid_sample, one level at a time
+    def forward(
+        self,
+        value: Tensor,  # unpacked below as (batch_size, sum of height*width over levels, num_heads, hidden_dim)
+        value_spatial_shapes: Tensor,  # not read by this implementation
+        value_spatial_shapes_list: list[tuple],  # one (height, width) pair per feature level
+        level_start_index: Tensor,  # not read by this implementation
+        sampling_locations: Tensor,  # unpacked below as (batch_size, num_queries, num_heads, num_levels, num_points, 2)
+        attention_weights: Tensor,
+        im2col_step: int,  # not read by this implementation
+    ):
+        batch_size, _, num_heads, hidden_dim = value.shape
+        _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+        value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1)  # one chunk per level
+        sampling_grids = 2 * sampling_locations - 1  # rescale x -> 2x - 1; grid_sample expects coordinates in [-1, 1]
+        sampling_value_list = []
+        for level_id, (height, width) in enumerate(value_spatial_shapes_list):
+            # batch_size, height*width, num_heads, hidden_dim
+            # -> batch_size, height*width, num_heads*hidden_dim
+            # -> batch_size, num_heads*hidden_dim, height*width
+            # -> batch_size*num_heads, hidden_dim, height, width
+            value_l_ = (
+                value_list[level_id]
+                .flatten(2)
+                .transpose(1, 2)
+                .reshape(batch_size * num_heads, hidden_dim, height, width)
+            )
+            # batch_size, num_queries, num_heads, num_points, 2
+            # -> batch_size, num_heads, num_queries, num_points, 2
+            # -> batch_size*num_heads, num_queries, num_points, 2
+            sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+            # batch_size*num_heads, hidden_dim, num_queries, num_points
+            sampling_value_l_ = nn.functional.grid_sample(
+                value_l_,
+                sampling_grid_l_,
+                mode="bilinear",
+                padding_mode="zeros",
+                align_corners=False,
+            )
+            sampling_value_list.append(sampling_value_l_)
+        # (batch_size, num_queries, num_heads, num_levels, num_points)
+        # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+        # -> (batch_size*num_heads, 1, num_queries, num_levels*num_points)
+        attention_weights = attention_weights.transpose(1, 2).reshape(
+            batch_size * num_heads, 1, num_queries, num_levels * num_points
+        )
+        output = (
+            (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)  # weight every sampled value
+            .sum(-1)  # sum over all levels and points
+            .view(batch_size, num_heads * hidden_dim, num_queries)
+        )
+        return output.transpose(1, 2).contiguous()  # (batch_size, num_queries, num_heads*hidden_dim)
+
+
+class MMGroundingDinoLearnedPositionEmbedding(nn.Module):
+    """
+    Learns separate row and column embeddings (50 entries each) and concatenates them for every spatial position.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        embedding_dim = config.d_model // 2  # half of d_model for columns, half for rows
+        self.row_embeddings = nn.Embedding(50, embedding_dim)  # indexed by row index; valid for heights up to 50
+        self.column_embeddings = nn.Embedding(50, embedding_dim)  # indexed by column index; valid for widths up to 50
+
+    def forward(self, pixel_values, pixel_mask=None):  # pixel_mask is accepted but not read
+        height, width = pixel_values.shape[-2:]
+        width_values = torch.arange(width, device=pixel_values.device)
+        height_values = torch.arange(height, device=pixel_values.device)
+        x_emb = self.column_embeddings(width_values)  # (width, embedding_dim)
+        y_emb = self.row_embeddings(height_values)  # (height, embedding_dim)
+        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
+        pos = pos.permute(2, 0, 1)  # (height, width, 2*embedding_dim) -> (2*embedding_dim, height, width)
+        pos = pos.unsqueeze(0)
+        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)  # one copy per item along the first dimension of pixel_values
+        return pos
+
+
+class MMGroundingDinoMultiscaleDeformableAttention(nn.Module):
+    """
+    Multiscale deformable attention as proposed in Deformable DETR (`n_points` sampled locations per head and level).
+    """
+
+    def __init__(self, config: MMGroundingDinoConfig, num_heads: int, n_points: int):
+        super().__init__()
+
+        self.attn = MultiScaleDeformableAttention()  # performs the sampling and weighted sum in forward
+
+        if config.d_model % num_heads != 0:
+            raise ValueError(
+                f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+            )
+        dim_per_head = config.d_model // num_heads
+        # warn (do not fail) when dim_per_head is not a power of 2
+        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+            warnings.warn(
+                "You'd better set embed_dim (d_model) in MMGroundingDinoMultiscaleDeformableAttention to make the"
+                " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
+                " implementation."
+            )
+
+        self.im2col_step = 64  # forwarded unchanged to self.attn on every call
+
+        self.d_model = config.d_model
+        self.n_levels = config.num_feature_levels
+        self.n_heads = num_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)  # 2 values per point
+        self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)  # one logit per point
+        self.value_proj = nn.Linear(config.d_model, config.d_model)
+        self.output_proj = nn.Linear(config.d_model, config.d_model)
+
+        self.disable_custom_kernels = config.disable_custom_kernels  # stored; not read anywhere in this class
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,  # boolean; value vectors are zeroed where it is False
+        encoder_hidden_states=None,  # effectively required: its shape is read unconditionally below
+        encoder_attention_mask=None,  # not read by this method
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,  # last dimension must be 2 or 4 (checked below)
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        output_attentions: bool = False,  # not read: the attention weights are always returned
+    ):
+        # add position embeddings to the hidden states before predicting sampling offsets and attention weights
+        if position_embeddings is not None:
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        batch_size, num_queries, _ = hidden_states.shape
+        batch_size, sequence_length, _ = encoder_hidden_states.shape
+        # Ignore copy
+        if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+            raise ValueError(
+                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+            )
+
+        value = self.value_proj(encoder_hidden_states)
+        if attention_mask is not None:
+            # zero the value vectors at positions where attention_mask is False
+            value = value.masked_fill(~attention_mask[..., None], float(0))
+        value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)  # split into heads
+        sampling_offsets = self.sampling_offsets(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
+        )
+        attention_weights = self.attention_weights(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
+        )
+        attention_weights = F.softmax(attention_weights, -1).view(  # softmax jointly over all levels and points of a head
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
+        )
+        # batch_size, num_queries, n_heads, n_levels, n_points, 2
+        num_coordinates = reference_points.shape[-1]
+        if num_coordinates == 2:
+            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)  # swap the two columns
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
+                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
+        elif num_coordinates == 4:
+            sampling_locations = (  # first two coordinates are the base; offsets are scaled by the last two
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+            )
+        else:
+            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+
+        output = self.attn(
+            value,
+            spatial_shapes,
+            spatial_shapes_list,
+            level_start_index,
+            sampling_locations,
+            attention_weights,
+            self.im2col_step,
+        )
+
+        output = self.output_proj(output)
+
+        return output, attention_weights  # weights have shape (batch_size, num_queries, n_heads, n_levels, n_points)
+
+
+class MMGroundingDinoBiMultiHeadAttention(nn.Module):  # bidirectional vision<->text cross-attention sharing one score matrix
+    def __init__(self, config):
+        super().__init__()
+
+        vision_dim = text_dim = config.d_model
+        embed_dim = config.encoder_ffn_dim // 2
+        num_heads = config.encoder_attention_heads // 2
+        dropout = config.fusion_dropout
+
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.vision_dim = vision_dim
+        self.text_dim = text_dim
+
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by `num_heads` (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
+            )
+        self.scale = self.head_dim ** (-0.5)
+        self.dropout = dropout
+
+        self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim)  # vision queries
+        self.text_proj = nn.Linear(self.text_dim, self.embed_dim)  # text keys
+        self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim)
+        self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim)
+
+        self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim)
+        self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim)
+
+    def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        vision_features: torch.FloatTensor,
+        text_features: torch.FloatTensor,
+        vision_attention_mask: Optional[torch.BoolTensor] = None,
+        text_attention_mask: Optional[torch.BoolTensor] = None,
+    ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]:
+        """Image-to-text and text-to-image cross-attention
+
+        Args:
+            vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`):
+                Projected flattened image features generated by the vision backbone.
+            text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`):
+                Projected text features generated by the text encoder.
+            vision_attention_mask (`torch.BoolTensor`, **optional**):
+                Mask over vision tokens, applied when text attends to the image. True positions are masked out.
+            text_attention_mask (`torch.BoolTensor`, **optional**):
+                Mask over text tokens, applied when the image attends to text. True positions are masked out.
+
+        Returns:
+            `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an attention
+            output and weights:
+            - **vision_attn_output** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`)
+              --
+                Output of the image-to-text cross-attention layer.
+            - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, vision_sequence_length,
+              text_sequence_length)`) --
+                Attention weights of the image-to-text cross-attention layer.
+            - **text_attn_output** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`) --
+                Output of the text-to-image cross-attention layer.
+            - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, text_sequence_length,
+              vision_sequence_length)`) --
+                Attention weights of the text-to-image cross-attention layer.
+        """
+        batch_size, tgt_len, _ = vision_features.size()
+
+        vision_query_states = self.vision_proj(vision_features) * self.scale  # scale = head_dim ** -0.5
+        vision_query_states = self._reshape(vision_query_states, tgt_len, batch_size)
+
+        text_key_states = self.text_proj(text_features)
+        text_key_states = self._reshape(text_key_states, -1, batch_size)
+
+        vision_value_states = self.values_vision_proj(vision_features)
+        vision_value_states = self._reshape(vision_value_states, -1, batch_size)
+
+        text_value_states = self.values_text_proj(text_features)
+        text_value_states = self._reshape(text_value_states, -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)  # fold heads into the batch dimension for bmm
+
+        vision_query_states = vision_query_states.view(*proj_shape)
+        text_key_states = text_key_states.view(*proj_shape)
+        vision_value_states = vision_value_states.view(*proj_shape)
+        text_value_states = text_value_states.view(*proj_shape)
+
+        src_len = text_key_states.size(1)
+        attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2))  # bs*nhead, nimg, ntxt
+
+        if attn_weights.size() != (batch_size * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+
+        attn_weights = attn_weights - attn_weights.max()  # shift by the global maximum before clamping
+        # Do not increase -50000/50000, data type half has quite limited range
+        attn_weights = torch.clamp(attn_weights, min=-50000, max=50000)
+
+        attn_weights_transposed = attn_weights.transpose(1, 2)  # bs*nhead, ntxt, nimg: scores reused for text-to-image
+        text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0]
+
+        # Do not increase -50000/50000, data type half has quite limited range
+        text_attn_weights = torch.clamp(text_attn_weights, min=-50000, max=50000)
+
+        # mask vision for language
+        if vision_attention_mask is not None:
+            vision_attention_mask = (
+                vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            )
+            text_attn_weights.masked_fill_(vision_attention_mask, float("-inf"))
+
+        text_attn_weights = text_attn_weights.softmax(dim=-1)
+
+        # mask language for vision
+        if text_attention_mask is not None:
+            text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1)
+            attn_weights.masked_fill_(text_attention_mask, float("-inf"))
+        vision_attn_weights = attn_weights.softmax(dim=-1)
+
+        vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training)
+        text_attn_probs = F.dropout(text_attn_weights, p=self.dropout, training=self.training)
+
+        vision_attn_output = torch.bmm(vision_attn_probs, text_value_states)  # vision tokens gather text values
+        text_attn_output = torch.bmm(text_attn_probs, vision_value_states)  # text tokens gather vision values
+
+        if vision_attn_output.size() != (batch_size * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`vision_attn_output` should be of size {(batch_size * self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}"
+            )
+
+        if text_attn_output.size() != (batch_size * self.num_heads, src_len, self.head_dim):
+            raise ValueError(
+                f"`text_attn_output` should be of size {(batch_size * self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}"
+            )
+
+        vision_attn_output = vision_attn_output.view(batch_size, self.num_heads, tgt_len, self.head_dim)
+        vision_attn_output = vision_attn_output.transpose(1, 2)
+        vision_attn_output = vision_attn_output.reshape(batch_size, tgt_len, self.embed_dim)  # merge heads
+
+        text_attn_output = text_attn_output.view(batch_size, self.num_heads, src_len, self.head_dim)
+        text_attn_output = text_attn_output.transpose(1, 2)
+        text_attn_output = text_attn_output.reshape(batch_size, src_len, self.embed_dim)  # merge heads
+
+        vision_attn_output = self.out_vision_proj(vision_attn_output)
+        text_attn_output = self.out_text_proj(text_attn_output)
+
+        return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights)
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Returns `input` itself when `training` is False or `drop_prob` is `None`/0. Otherwise every sample (index along
+    the first dimension) is zeroed with probability `drop_prob` and the kept samples are divided by `1 - drop_prob`.
+    Naming note (Ross Wightman): same as the DropConnect impl created for EfficientNet etc., renamed to 'drop path'
+    because 'Drop Connect' is a different form of dropout from a separate paper. See discussion:
+    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
+    """
+    if not drop_prob or not training:  # `not drop_prob` also covers None, the default of MMGroundingDinoDropPath
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+class MMGroundingDinoDropPath(nn.Module):
+    """Module wrapper around `drop_path`: applies per-sample stochastic depth with the stored drop probability."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob  # handed to `drop_path` unchanged on every forward call
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, drop_prob=self.drop_prob, training=self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class MMGroundingDinoFusionLayer(nn.Module):  # pre-norm bidirectional vision/text fusion with per-channel scaled residuals
+    def __init__(self, config):
+        super().__init__()
+        drop_path = config.fusion_droppath  # local float; shadows the module-level `drop_path` function in this scope
+
+        # pre layer norm
+        self.layer_norm_vision = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.layer_norm_text = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.attn = MMGroundingDinoBiMultiHeadAttention(config)
+
+        # add layer scale for training stability
+        self.drop_path = MMGroundingDinoDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        init_values = 1e-4
+        self.vision_param = nn.Parameter(init_values * torch.ones(config.d_model), requires_grad=True)
+        self.text_param = nn.Parameter(init_values * torch.ones(config.d_model), requires_grad=True)
+
+    def forward(
+        self,
+        vision_features: torch.FloatTensor,
+        text_features: torch.FloatTensor,
+        attention_mask_vision: Optional[torch.BoolTensor] = None,
+        attention_mask_text: Optional[torch.BoolTensor] = None,
+    ) -> tuple[tuple[torch.FloatTensor, torch.FloatTensor], tuple[torch.FloatTensor, torch.FloatTensor]]:
+        """Image and text features fusion
+
+        Args:
+            vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`):
+                Projected flattened image features generated by the vision backbone.
+            text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`):
+                Projected text features generated by the text encoder.
+            attention_mask_vision (`torch.BoolTensor`, **optional**):
+                Passed to the attention module as `vision_attention_mask`. True for padding tokens.
+            attention_mask_text (`torch.BoolTensor`, **optional**):
+                Passed to the attention module as `text_attention_mask`. True for padding tokens.
+
+        Returns:
+            `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an enhanced
+            feature and attention output and weights:
+            - **vision_features** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, vision_dim)`) --
+                Updated vision features with attention output from image-to-text cross-attention layer.
+            - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, vision_sequence_length,
+              text_sequence_length)`) --
+                Attention weights of the image-to-text cross-attention layer.
+            - **text_features** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, text_dim)`) --
+                Updated text features with attention output from text-to-image cross-attention layer.
+            - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size * num_heads, text_sequence_length,
+              vision_sequence_length)`) --
+                Attention weights of the text-to-image cross-attention layer.
+        """
+        vision_features = self.layer_norm_vision(vision_features)  # note: the residual below uses the normalized features
+        text_features = self.layer_norm_text(text_features)
+        (delta_v, vision_attn), (delta_t, text_attn) = self.attn(
+            vision_features,
+            text_features,
+            vision_attention_mask=attention_mask_vision,
+            text_attention_mask=attention_mask_text,
+        )
+        vision_features = vision_features + self.drop_path(self.vision_param * delta_v)  # per-channel scaled update
+        text_features = text_features + self.drop_path(self.text_param * delta_t)
+
+        return (vision_features, vision_attn), (text_features, text_attn)
+
+
+@auto_docstring
+class MMGroundingDinoPreTrainedModel(PreTrainedModel):  # shared base class: config type, weight init, checkpointing toggle
+    config: MMGroundingDinoConfig
+    base_model_prefix = "model"
+    main_input_name = "pixel_values"
+
+    def _init_weights(self, module):  # initialise one submodule; the first matching isinstance branch wins
+        std = self.config.init_std
+
+        if isinstance(module, MMGroundingDinoLearnedPositionEmbedding):
+            nn.init.uniform_(module.row_embeddings.weight)
+            nn.init.uniform_(module.column_embeddings.weight)
+        elif isinstance(module, MMGroundingDinoMultiscaleDeformableAttention):
+            nn.init.constant_(module.sampling_offsets.weight.data, 0.0)  # zero weights: initial offsets come from the bias alone
+            default_dtype = torch.get_default_dtype()
+            thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
+                2.0 * math.pi / module.n_heads
+            )
+            grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)  # one (cos, sin) direction per head
+            grid_init = (
+                (grid_init / grid_init.abs().max(-1, keepdim=True)[0])  # rescale so the larger |component| is 1
+                .view(module.n_heads, 1, 1, 2)
+                .repeat(1, module.n_levels, module.n_points, 1)
+            )
+            for i in range(module.n_points):
+                grid_init[:, :, i, :] *= i + 1  # the i-th sampling point starts (i + 1) steps along its head's direction
+            with torch.no_grad():
+                module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))  # replaces the bias Parameter object
+            nn.init.constant_(module.attention_weights.weight.data, 0.0)
+            nn.init.constant_(module.attention_weights.bias.data, 0.0)
+            nn.init.xavier_uniform_(module.value_proj.weight.data)
+            nn.init.constant_(module.value_proj.bias.data, 0.0)
+            nn.init.xavier_uniform_(module.output_proj.weight.data)
+            nn.init.constant_(module.output_proj.bias.data, 0.0)
+        elif isinstance(module, MMGroundingDinoBiMultiHeadAttention):  # xavier weights and zero biases for all six projections
+            nn.init.xavier_uniform_(module.vision_proj.weight)
+            module.vision_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.text_proj.weight)
+            module.text_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.values_vision_proj.weight)
+            module.values_vision_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.values_text_proj.weight)
+            module.values_text_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.out_vision_proj.weight)
+            module.out_vision_proj.bias.data.fill_(0)
+            nn.init.xavier_uniform_(module.out_text_proj.weight)
+            module.out_text_proj.bias.data.fill_(0)
+        elif isinstance(module, MMGroundingDinoFusionLayer):
+            module.vision_param.data.fill_(1e-4)  # same value the fusion layer's constructor uses
+            module.text_param.data.fill_(1e-4)
+        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # keep the padding row at zero
+        elif isinstance(module, MMGroundingDinoMLPPredictionHead):
+            nn.init.constant_(module.layers[-1].weight.data, 0)  # zero the head's last layer (weight and bias)
+            nn.init.constant_(module.layers[-1].bias.data, 0)
+
+        if hasattr(module, "reference_points") and not self.config.two_stage:  # attribute checks run for every module
+            nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
+            nn.init.constant_(module.reference_points.bias.data, 0.0)
+        if hasattr(module, "level_embed"):
+            nn.init.normal_(module.level_embed)
+        if isinstance(module, MMGroundingDinoContrastiveEmbedding):
+            nn.init.constant_(module.bias, -math.log((1 - 0.01) / 0.01))  # logit of 0.01, i.e. sigmoid(bias) == 0.01
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, MMGroundingDinoDecoder):  # only the decoder's flag is toggled here
+            module.gradient_checkpointing = value
+
+
+class MMGroundingDinoFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        self.register_buffer("weight", torch.ones(n))  # buffers, not Parameters: they are not returned by parameters()
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"  # this module registers no such buffer
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]  # remove the entry before delegating to the default loader
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # reshape the per-channel buffers to (1, n, 1, 1) up front
+        # so they broadcast over a 4-D input (presumably NCHW -- confirm)
+        weight = self.weight.reshape(1, -1, 1, 1)
+        bias = self.bias.reshape(1, -1, 1, 1)
+        running_var = self.running_var.reshape(1, -1, 1, 1)
+        running_mean = self.running_mean.reshape(1, -1, 1, 1)
+        epsilon = 1e-5
+        scale = weight * (running_var + epsilon).rsqrt()  # weight / sqrt(running_var + epsilon)
+        bias = bias - running_mean * scale  # fold the mean subtraction into the bias
+        return x * scale + bias
+
+
+def replace_batch_norm(model):
+    r"""
+    Recursively replace all `torch.nn.BatchNorm2d` with `MMGroundingDinoFrozenBatchNorm2d` (in place; returns None).
+
+    Args:
+        model (torch.nn.Module):
+            input model
+    """
+    for name, module in model.named_children():
+        if isinstance(module, nn.BatchNorm2d):
+            new_module = MMGroundingDinoFrozenBatchNorm2d(module.num_features)
+
+            if module.weight.device != torch.device("meta"):  # skip the copies when the weights live on the meta device
+                new_module.weight.data.copy_(module.weight)
+                new_module.bias.data.copy_(module.bias)
+                new_module.running_mean.data.copy_(module.running_mean)
+                new_module.running_var.data.copy_(module.running_var)
+
+            model._modules[name] = new_module  # swap the child under the same name
+
+        if len(list(module.children())) > 0:  # `module` is still the original child; recurse into it if it has children
+            replace_batch_norm(module)
+
+
+class MMGroundingDinoConvEncoder(nn.Module):
+    """
+    Convolutional backbone, using either the AutoBackbone API or one from the timm library.
+
+    nn.BatchNorm2d layers are replaced by MMGroundingDinoFrozenBatchNorm2d as defined above.
+
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+
+        if config.use_timm_backbone:
+            requires_backends(self, ["timm"])
+            backbone = create_model(  # `create_model` is only imported when timm is available
+                config.backbone,
+                pretrained=config.use_pretrained_backbone,
+                features_only=True,
+                **config.backbone_kwargs,
+            )
+        else:
+            backbone = load_backbone(config)
+
+        # replace batch norm by frozen batch norm
+        with torch.no_grad():
+            replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = (  # channel counts as reported by the backbone itself
+            self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels
+        )
+
+        backbone_model_type = None
+        if config.backbone is not None:
+            backbone_model_type = config.backbone
+        elif config.backbone_config is not None:
+            backbone_model_type = config.backbone_config.model_type
+        else:
+            raise ValueError("Either `backbone` or `backbone_config` should be provided in the config")
+
+        if "resnet" in backbone_model_type:  # resnet only: freeze parameters whose name matches none of the listed stages
+            for name, parameter in self.model.named_parameters():
+                if config.use_timm_backbone:
+                    if "layer2" not in name and "layer3" not in name and "layer4" not in name:
+                        parameter.requires_grad_(False)
+                else:
+                    if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name:
+                        parameter.requires_grad_(False)
+
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):  # returns a list of (feature_map, mask) pairs
+        # send pixel_values through the model to get list of feature maps
+        features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps
+
+        out = []
+        for feature_map in features:
+            # resize pixel_mask to the spatial size of the corresponding feature_map and cast it back to bool
+            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            out.append((feature_map, mask))
+        return out
+
+
+class MMGroundingDinoConvModel(nn.Module):
+    """
+    Pairs the convolutional encoder with a position embedding applied to each of its feature maps.
+    """
+
+    def __init__(self, conv_encoder, position_embedding):
+        super().__init__()
+        self.conv_encoder = conv_encoder
+        self.position_embedding = position_embedding
+
+    def forward(self, pixel_values, pixel_mask):
+        """Return the encoder's `(feature_map, mask)` list together with one position embedding per entry."""
+        features_and_masks = self.conv_encoder(pixel_values, pixel_mask)
+        # each embedding is cast to the dtype of the feature map it belongs to
+        position_embeddings = [
+            self.position_embedding(feature_map, mask).to(feature_map.dtype)
+            for feature_map, mask in features_and_masks
+        ]
+        return features_and_masks, position_embeddings
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the MMGroundingDinoEncoder. This class extends BaseModelOutput, due to:
+    - vision and text last hidden states
+    - vision and text intermediate hidden states
+    """
+)
+class MMGroundingDinoEncoderOutput(ModelOutput):
+    r"""
+    last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the vision encoder.
+    last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the text encoder.
+    vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+        layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+        output of each layer plus the initial embedding outputs.
+    text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+        of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+        each layer plus the initial embedding outputs.
+    """
+    # as filled by the encoder, `attentions` is (fused vision, fused text, enhanced text, deformable) tuples
+    last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    last_hidden_state_text: Optional[torch.FloatTensor] = None
+    vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    text_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+
+
+class MMGroundingDinoMultiheadAttention(nn.Module):
+    """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`."""
+
+    def __init__(self, config, num_attention_heads=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        indivisible = hidden_size % num_attention_heads != 0
+        if indivisible and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({num_attention_heads})"
+            )
+
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_size = int(hidden_size / num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        # one projection per attention input, then a final projection applied to the merged heads
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.out_proj = nn.Linear(hidden_size, hidden_size)
+
+        self.dropout = nn.Dropout(config.attention_dropout)
+
+    def forward(
+        self,
+        queries: torch.Tensor,
+        keys: torch.Tensor,
+        values: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        """Attend from `queries` to `keys`/`values`; the attention probabilities are returned only on request."""
+
+        batch_size, _, _ = queries.shape
+
+        def split_heads(projected: torch.Tensor) -> torch.Tensor:
+            # (batch, seq, all_head_size) -> (batch, heads, seq, head_size)
+            per_head = projected.view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            return per_head.transpose(1, 2)
+
+        query_layer = split_heads(self.query(queries))
+        key_layer = split_heads(self.key(keys))
+        value_layer = split_heads(self.value(values))
+
+        # scaled dot product between every query and every key
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        # the mask, when given, is added to the raw scores before the softmax
+        if attention_mask is not None:
+            attention_scores = attention_scores + attention_mask
+
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # dropout acts on the probabilities, i.e. it removes whole attended positions
+        attention_probs = self.dropout(attention_probs)
+
+        # weighted sum of the values, then merge the heads back into a single feature axis
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.transpose(1, 2).contiguous()
+        context_layer = context_layer.view(*context_layer.shape[:-2], self.all_head_size)
+
+        context_layer = self.out_proj(context_layer)
+
+        if output_attentions:
+            return context_layer, attention_probs
+        return (context_layer,)
+
+
+class MMGroundingDinoTextEnhancerLayer(nn.Module):
+    """Vanilla Transformer with text embeddings as input"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self_attn = MMGroundingDinoMultiheadAttention(
+            config, num_attention_heads=config.encoder_attention_heads // 2
+        )
+
+        # Implementation of Feedforward model
+        self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model)
+
+        self.layer_norm_before = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.layer_norm_after = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+
+        self.activation = ACT2FN[config.activation_function]
+        self.num_heads = config.encoder_attention_heads // 2
+        self.dropout = config.text_enhancer_dropout
+
+    def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]):
+        return hidden_state if position_embeddings is None else hidden_state + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_masks: Optional[torch.BoolTensor] = None,
+        position_embeddings: Optional[torch.FloatTensor] = None,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+        """Text self-attention to enhance projection of text features generated by
+        the text encoder (AutoModel based on text_config) within MMGroundingDinoEncoderLayer
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`):
+                Text features generated by the text encoder.
+            attention_masks (`torch.BoolTensor`, *optional*):
+                True where attention is allowed, False where it is blocked. `None` applies no mask.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings to be added to the hidden states.
+
+        Returns:
+            `tuple(torch.FloatTensor)` comprising two elements:
+            - **hidden_states** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) --
+                Output of the text self-attention layer.
+            - **attention_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length,
+              sequence_length)`) --
+                Attention weights of the text self-attention layer.
+        """
+
+        # turn a per-batch boolean mask into an additive one repeated for every head; skip when no mask is given
+        is_batched_3d = attention_masks is not None and attention_masks.dim() == 3
+        if is_batched_3d and attention_masks.shape[0] == hidden_states.shape[0]:
+            # batch_size, num_queries, num_keys
+            attention_masks = attention_masks[:, None, :, :]
+            attention_masks = attention_masks.repeat(1, self.num_heads, 1, 1)
+            dtype = hidden_states.dtype
+            attention_masks = attention_masks.to(dtype=dtype)  # fp16 compatibility
+            attention_masks = (1.0 - attention_masks) * torch.finfo(dtype).min
+
+        queries = keys = self.with_pos_embed(hidden_states, position_embeddings)
+        attention_output, attention_weights = self.self_attn(
+            queries=queries,
+            keys=keys,
+            values=hidden_states,
+            attention_mask=attention_masks,
+            output_attentions=True,
+        )
+        attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + attention_output
+        hidden_states = self.layer_norm_before(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.activation(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = hidden_states + residual
+        hidden_states = self.layer_norm_after(hidden_states)
+
+        return hidden_states, attention_weights
+
+
+class MMGroundingDinoDeformableLayer(nn.Module):
+    def __init__(self, config: MMGroundingDinoConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = MMGroundingDinoMultiscaleDeformableAttention(
+            config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        output_attentions: bool = False,
+    ):
+        """Deformable self-attention then a feed-forward block, each with dropout, residual and LayerNorm.
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Input to the layer.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+                Attention mask.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings, to be added to `hidden_states`.
+            reference_points (`torch.FloatTensor`, *optional*):
+                Reference points.
+            spatial_shapes (`torch.LongTensor`, *optional*):
+                Spatial shapes of the backbone feature maps.
+            spatial_shapes_list (`list[tuple[int, int]]`, *optional*):
+                Spatial shapes of the backbone feature maps (but as list for export compatibility).
+            level_start_index (`torch.LongTensor`, *optional*):
+                Level start index.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if self.training:  # NOTE(review): clamp bounds +/-inf only; NaN presumably passes through - confirm
+            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        return hidden_states, attn_weights
+
+
+# Based on https://github.com/IDEA-Research/MMGroundingDino/blob/2b62f419c292ca9c518daae55512fabc3fead4a4/MMGroundingDino/models/MMGroundingDino/utils.py#L24
+def get_sine_pos_embed(
+    pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True
+) -> Tensor:
+    """
+    Generate sine position embeddings from a position tensor with any number of leading dimensions.
+
+    Args:
+        pos_tensor (torch.Tensor):
+            Tensor containing positions. Shape: [..., n].
+        num_pos_feats (`int`, *optional*, defaults to 128):
+            Projected shape for each float in the tensor.
+        temperature (`int`, *optional*, defaults to 10000):
+            Temperature in the sine/cosine function.
+        exchange_xy (`bool`, *optional*, defaults to `True`):
+            Exchange pos x and pos y. For example, input tensor is [x,y], the results will be [pos(y), pos(x)].
+
+    Returns:
+        position_embeddings (torch.Tensor): shape: [..., n * num_pos_feats].
+    """
+    scale = 2 * math.pi
+    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device)
+    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+
+    def sine_func(x: torch.Tensor):
+        sin_x = x * scale / dim_t
+        # interleave sin/cos over the trailing axis so leading dimensions of any rank are preserved
+        return torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=-1).flatten(-2)
+
+    pos_tensor = pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)
+    position_embeddings = [sine_func(x) for x in pos_tensor]
+    if exchange_xy:
+        position_embeddings[0], position_embeddings[1] = position_embeddings[1], position_embeddings[0]
+    position_embeddings = torch.cat(position_embeddings, dim=-1)
+    return position_embeddings
+
+
+class MMGroundingDinoEncoderLayer(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+
+        self.d_model = config.d_model
+
+        self.text_enhancer_layer = MMGroundingDinoTextEnhancerLayer(config)
+        self.fusion_layer = MMGroundingDinoFusionLayer(config)
+        self.deformable_layer = MMGroundingDinoDeformableLayer(config)
+
+    def get_text_position_embeddings(
+        self,
+        text_features: Tensor,
+        text_position_embedding: Optional[torch.Tensor],
+        text_position_ids: Optional[torch.Tensor],
+    ) -> Tensor:
+        """Pick the text position embedding: from ids if given, else the supplied one, else a sine default."""
+        batch_size, seq_length, _ = text_features.shape
+
+        # explicit position ids take precedence over everything else
+        if text_position_ids is not None:
+            return get_sine_pos_embed(text_position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False)
+
+        # a precomputed embedding is passed through untouched
+        if text_position_embedding is not None:
+            return text_position_embedding
+
+        # fall back to sine embeddings of the token indices 0..seq_length-1, one copy per batch element
+        token_index = torch.arange(seq_length, device=text_features.device).float()
+        token_index = token_index[None, :, None].repeat(batch_size, 1, 1)
+        return get_sine_pos_embed(token_index, num_pos_feats=self.d_model, exchange_xy=False)
+
+    def forward(
+        self,
+        vision_features: Tensor,
+        vision_position_embedding: Tensor,
+        spatial_shapes: Tensor,
+        spatial_shapes_list: list[tuple[int, int]],
+        level_start_index: Tensor,
+        key_padding_mask: Tensor,
+        reference_points: Tensor,
+        text_features: Optional[Tensor] = None,
+        text_attention_mask: Optional[Tensor] = None,
+        text_position_embedding: Optional[Tensor] = None,
+        text_self_attention_masks: Optional[Tensor] = None,
+        text_position_ids: Optional[Tensor] = None,
+    ):
+        """Run fusion, text enhancement and deformable attention in that order; return features and attentions."""
+        text_position_embedding = self.get_text_position_embeddings(
+            text_features, text_position_embedding, text_position_ids
+        )
+        fused_vision, fused_text = self.fusion_layer(
+            vision_features=vision_features,
+            text_features=text_features,
+            attention_mask_vision=key_padding_mask,
+            attention_mask_text=text_attention_mask,
+        )
+        vision_features, vision_fused_attn = fused_vision
+        text_features, text_fused_attn = fused_text
+
+        text_features, text_enhanced_attn = self.text_enhancer_layer(
+            hidden_states=text_features,
+            attention_masks=~text_self_attention_masks,  # note we use ~ for mask here
+            position_embeddings=text_position_embedding,
+        )
+
+        vision_features, vision_deformable_attn = self.deformable_layer(
+            hidden_states=vision_features,
+            attention_mask=~key_padding_mask,
+            position_embeddings=vision_position_embedding,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+        )
+
+        attentions = (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn)
+        return (vision_features, text_features), attentions
+
+
+class MMGroundingDinoEncoder(MMGroundingDinoPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
+    [`MMGroundingDinoEncoderLayer`].
+
+    The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
+
+    Args:
+        config: MMGroundingDinoConfig
+    """
+
+    def __init__(self, config: MMGroundingDinoConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layers = nn.ModuleList([MMGroundingDinoEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        """
+        Get reference points for each feature map.
+
+        Args:
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of each feature map.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+                Valid ratios of each feature map.
+            device (`torch.device`):
+                Device on which to create the tensors.
+        Returns:
+            `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
+        """
+        reference_points_list = []
+        for level, (height, width) in enumerate(spatial_shapes):
+            ref_y, ref_x = meshgrid(
+                torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
+                torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
+                indexing="ij",
+            )
+            # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+
+    def forward(
+        self,
+        vision_features: Tensor,
+        vision_attention_mask: Tensor,
+        vision_position_embedding: Tensor,
+        spatial_shapes: Tensor,
+        spatial_shapes_list: list[tuple[int, int]],
+        level_start_index: Tensor,
+        valid_ratios=None,
+        text_features: Optional[Tensor] = None,
+        text_attention_mask: Optional[Tensor] = None,
+        text_position_embedding: Optional[Tensor] = None,
+        text_self_attention_masks: Optional[Tensor] = None,
+        text_position_ids: Optional[Tensor] = None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""Pass the vision and text features through every encoder layer in order.
+        Args:
+            vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+            vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+                - 0 for pixel features that are real (i.e. **not masked**),
+                - 1 for pixel features that are padding (i.e. **masked**).
+                [What are attention masks?](../glossary#attention-mask)
+            vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of each feature map.
+            spatial_shapes_list (`list[tuple[int, int]]`):
+                Spatial shapes of each feature map (but as list for export compatibility).
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
+                Starting index of each feature map.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+                Ratio of valid area in each feature level.
+            text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`):
+                Flattened text features that are passed to the encoder.
+            text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*):
+                Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`:
+                - 0 for text features that are real (i.e. **not masked**),
+                - 1 for text features that are padding (i.e. **masked**).
+                [What are attention masks?](../glossary#attention-mask)
+            text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`):
+                Masks for the text self-attention. Each layer inverts them with `~` before the text enhancer, so:
+                - 1 for query/key pairs that are blocked (i.e. **masked**),
+                - 0 for query/key pairs that may attend to each other (i.e. **not masked**).
+            text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`):
+                Position ids for text features.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device)
+
+        encoder_vision_states = () if output_hidden_states else None
+        encoder_text_states = () if output_hidden_states else None
+        all_attns = () if output_attentions else None
+        all_attn_fused_text = () if output_attentions else None
+        all_attn_fused_vision = () if output_attentions else None
+        all_attn_enhanced_text = () if output_attentions else None
+        all_attn_deformable = () if output_attentions else None
+        for i, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_vision_states += (vision_features,)
+                encoder_text_states += (text_features,)
+
+            (vision_features, text_features), attentions = encoder_layer(
+                vision_features=vision_features,
+                vision_position_embedding=vision_position_embedding,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                level_start_index=level_start_index,
+                key_padding_mask=vision_attention_mask,
+                reference_points=reference_points,
+                text_features=text_features,
+                text_attention_mask=text_attention_mask,
+                text_position_embedding=text_position_embedding,
+                text_self_attention_masks=text_self_attention_masks,
+                text_position_ids=text_position_ids,
+            )
+
+            if output_attentions:
+                all_attn_fused_vision += (attentions[0],)
+                all_attn_fused_text += (attentions[1],)
+                all_attn_enhanced_text += (attentions[2],)
+                all_attn_deformable += (attentions[3],)
+
+        if output_hidden_states:
+            encoder_vision_states += (vision_features,)
+            encoder_text_states += (text_features,)
+
+        if output_attentions:
+            all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable)
+
+        if not return_dict:
+            enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns]
+            return tuple(v for v in enc_outputs if v is not None)
+        return MMGroundingDinoEncoderOutput(
+            last_hidden_state_vision=vision_features,
+            last_hidden_state_text=text_features,
+            vision_hidden_states=encoder_vision_states,
+            text_hidden_states=encoder_text_states,
+            attentions=all_attns,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the MMGroundingDinoDecoder. This class adds two attributes to
+    BaseModelOutputWithCrossAttentions, namely:
+    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+    - a stacked tensor of intermediate reference points.
+    """
+)
+class MMGroundingDinoDecoderOutput(ModelOutput):
+    r"""
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+
+
+class MMGroundingDinoDecoderLayer(nn.Module):
+    """One decoder layer: query self-attention, text cross-attention, deformable vision cross-attention, FFN."""
+
+    def __init__(self, config: MMGroundingDinoConfig):
+        super().__init__()
+        width, eps = config.d_model, config.layer_norm_eps
+        heads = config.decoder_attention_heads
+        self.embed_dim = width
+
+        # Stage 1: query self-attention (submodule creation order is kept as-is).
+        self.self_attn = MMGroundingDinoMultiheadAttention(config, num_attention_heads=heads)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.self_attn_layer_norm = nn.LayerNorm(width, eps)
+
+        # Stage 2: cross-attention from the queries to the text features.
+        self.encoder_attn_text = MMGroundingDinoMultiheadAttention(config, num_attention_heads=heads)
+        self.encoder_attn_text_layer_norm = nn.LayerNorm(width, eps)
+
+        # Stage 3: multi-scale deformable cross-attention to the vision features.
+        self.encoder_attn = MMGroundingDinoMultiscaleDeformableAttention(
+            config, num_heads=heads, n_points=config.decoder_n_points
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(width, eps)
+
+        # Stage 4: position-wise feed-forward network.
+        ffn_width = config.decoder_ffn_dim
+        self.fc1 = nn.Linear(width, ffn_width)
+        self.fc2 = nn.Linear(ffn_width, width)
+        self.final_layer_norm = nn.LayerNorm(width, eps)
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        """Return `tensor` plus `position_embeddings`, or `tensor` itself when no embeddings are given."""
+        return tensor + position_embeddings if position_embeddings is not None else tensor
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        vision_encoder_hidden_states: Optional[torch.Tensor] = None,
+        vision_encoder_attention_mask: Optional[torch.Tensor] = None,
+        text_encoder_hidden_states: Optional[torch.Tensor] = None,
+        text_encoder_attention_mask: Optional[torch.Tensor] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Apply the four sub-blocks in order; each one is residual-add followed by LayerNorm.
+
+        `position_embeddings` is added to the self-attention queries/keys and to the text
+        cross-attention queries, and is forwarded to the deformable attention module.
+
+        Returns `(hidden_states,)`, extended with `(self_attn_weights, text_cross_attn_weights,
+        cross_attn_weights)` when `output_attentions` is truthy.
+        """
+        training = self.training
+
+        # 1) Self-attention: queries and keys carry the positional signal, values do not.
+        skip = hidden_states
+        pos_aware = self.with_pos_embed(hidden_states, position_embeddings)
+        hidden_states, self_attn_weights = self.self_attn(
+            queries=pos_aware,
+            keys=pos_aware,
+            values=hidden_states,
+            attention_mask=self_attn_mask,
+            output_attentions=True,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=training)
+        hidden_states = self.self_attn_layer_norm(skip + hidden_states)
+
+        # 2) Text cross-attention: position-aware queries attend over the text features.
+        skip = hidden_states
+        hidden_states, text_cross_attn_weights = self.encoder_attn_text(
+            queries=self.with_pos_embed(hidden_states, position_embeddings),
+            keys=text_encoder_hidden_states,
+            values=text_encoder_hidden_states,
+            attention_mask=text_encoder_attention_mask,
+            output_attentions=True,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=training)
+        hidden_states = self.encoder_attn_text_layer_norm(skip + hidden_states)
+
+        # 3) Vision cross-attention through the multi-scale deformable attention module.
+        skip = hidden_states
+        hidden_states, cross_attn_weights = self.encoder_attn(
+            hidden_states=hidden_states,
+            attention_mask=vision_encoder_attention_mask,
+            encoder_hidden_states=vision_encoder_hidden_states,
+            encoder_attention_mask=vision_encoder_attention_mask,
+            position_embeddings=position_embeddings,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=training)
+        hidden_states = self.encoder_attn_layer_norm(skip + hidden_states)
+
+        # 4) Feed-forward block.
+        skip = hidden_states
+        ffn_out = self.activation_fn(self.fc1(hidden_states))
+        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=training)
+        ffn_out = self.fc2(ffn_out)
+        ffn_out = nn.functional.dropout(ffn_out, p=self.dropout, training=training)
+        hidden_states = self.final_layer_norm(skip + ffn_out)
+
+        # Attention maps are only appended on request; the layer output always comes first.
+        if not output_attentions:
+            return (hidden_states,)
+        return (hidden_states, self_attn_weights, text_cross_attn_weights, cross_attn_weights)
+
+
+class MMGroundingDinoDecoder(MMGroundingDinoPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MMGroundingDinoDecoderLayer`].
+
+    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+    Some tweaks for Grounding DINO:
+
+    - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
+    - it also returns a stack of intermediate outputs and reference points from all decoding layers.
+
+    Args:
+        config: MMGroundingDinoConfig
+    """
+
+    def __init__(self, config: MMGroundingDinoConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layer_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+        self.layers = nn.ModuleList([MMGroundingDinoDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.reference_points_head = MMGroundingDinoMLPPredictionHead(
+            config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2
+        )
+        self.gradient_checkpointing = False
+
+        # hack implementation for iterative bounding box refinement as in two-stage Deformable DETR
+        self.bbox_embed = None
+        self.class_embed = None
+        self.query_scale = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds,
+        vision_encoder_hidden_states,
+        vision_encoder_attention_mask=None,
+        text_encoder_hidden_states=None,
+        text_encoder_attention_mask=None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        valid_ratios=None,
+        self_attn_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+            vision_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Last hidden state from encoder related to vision feature map.
+            vision_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+                - 1 for pixel features that are real (i.e. **not masked**),
+                - 0 for pixel features that are padding (i.e. **masked**).
+            text_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`):
+                Last hidden state from encoder related to text features.
+            text_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*):
+                Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`:
+                - 0 for text features that are real (i.e. **not masked**),
+                - 1 for text features that are padding (i.e. **masked**).
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+                Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            spatial_shapes_list (`list[tuple[int, int]]`):
+                Spatial shapes of the feature maps (but as list for export compatibility).
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indexes for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+            self_attn_mask (`torch.BoolTensor` of shape `(batch_size, text_seq_len)`):
+                Masks to avoid performing self-attention between vision hidden state. Mask values selected in `[0, 1]`:
+                - 1 for queries that are real (i.e. **not masked**),
+                - 0 for queries that are padding (i.e. **masked**).
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is None:
+            raise ValueError("`inputs_embeds` (the decoder query embeddings) must be provided")
+        hidden_states = inputs_embeds
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_attns = () if output_attentions else None
+        all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None
+        all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None
+        intermediate = ()
+        intermediate_reference_points = ()
+
+        if text_encoder_attention_mask is not None:
+            dtype = text_encoder_hidden_states.dtype
+            # Expand the mask to one row per head and per query, then turn its 1s into large negative biases.
+            text_encoder_attention_mask = text_encoder_attention_mask[:, None, None, :]
+            text_encoder_attention_mask = text_encoder_attention_mask.repeat(
+                1, self.config.decoder_attention_heads, self.config.num_queries, 1
+            )
+            text_encoder_attention_mask = text_encoder_attention_mask.to(dtype=dtype)
+            text_encoder_attention_mask = text_encoder_attention_mask * torch.finfo(dtype).min
+
+        for idx, decoder_layer in enumerate(self.layers):
+            num_coordinates = reference_points.shape[-1]
+            if num_coordinates == 4:
+                reference_points_input = (
+                    reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+                )
+            elif num_coordinates == 2:
+                reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+            else:
+                raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+            query_pos = get_sine_pos_embed(reference_points_input[:, :, 0, :], num_pos_feats=self.config.d_model // 2)
+            query_pos = self.reference_points_head(query_pos)
+
+            # In the original implementation, layer norm is applied before outputting intermediate hidden states.
+            # It is not applied between layers, though: each layer takes the previous layer's output as input
+            # without layer norm.
+            if output_hidden_states:
+                all_hidden_states += (self.layer_norm(hidden_states),)
+
+            if self.gradient_checkpointing and self.training:
+                # Arguments are passed positionally, in the parameter order of MMGroundingDinoDecoderLayer.forward.
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    query_pos,
+                    reference_points_input,
+                    spatial_shapes,
+                    spatial_shapes_list,
+                    level_start_index,
+                    vision_encoder_hidden_states,
+                    vision_encoder_attention_mask,
+                    text_encoder_hidden_states,
+                    text_encoder_attention_mask,
+                    self_attn_mask,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states=hidden_states,
+                    position_embeddings=query_pos,
+                    reference_points=reference_points_input,
+                    spatial_shapes=spatial_shapes,
+                    spatial_shapes_list=spatial_shapes_list,
+                    level_start_index=level_start_index,
+                    vision_encoder_hidden_states=vision_encoder_hidden_states,
+                    vision_encoder_attention_mask=vision_encoder_attention_mask,
+                    text_encoder_hidden_states=text_encoder_hidden_states,
+                    text_encoder_attention_mask=text_encoder_attention_mask,
+                    self_attn_mask=self_attn_mask,
+                    output_attentions=output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                tmp = self.bbox_embed[idx](hidden_states)
+                num_coordinates = reference_points.shape[-1]
+                if num_coordinates == 4:
+                    new_reference_points = tmp + torch.special.logit(reference_points, eps=1e-5)
+                    new_reference_points = new_reference_points.sigmoid()
+                elif num_coordinates == 2:
+                    new_reference_points = tmp
+                    new_reference_points[..., :2] = tmp[..., :2] + torch.special.logit(reference_points, eps=1e-5)
+                    new_reference_points = new_reference_points.sigmoid()
+                else:
+                    raise ValueError(
+                        f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}"
+                    )
+                reference_points = new_reference_points.detach()
+
+            intermediate += (self.layer_norm(hidden_states),)
+            intermediate_reference_points += (reference_points,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if text_encoder_hidden_states is not None:
+                    all_cross_attns_text += (layer_outputs[2],)
+
+                if vision_encoder_hidden_states is not None:
+                    all_cross_attns_vision += (layer_outputs[3],)
+
+        # Keep batch_size as first dimension
+        intermediate = torch.stack(intermediate, dim=1)
+        intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if output_attentions:
+            all_attns += (all_self_attns, all_cross_attns_text, all_cross_attns_vision)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    intermediate,
+                    intermediate_reference_points,
+                    all_hidden_states,
+                    all_attns,
+                ]
+                if v is not None
+            )
+        return MMGroundingDinoDecoderOutput(
+            last_hidden_state=hidden_states,
+            intermediate_hidden_states=intermediate,
+            intermediate_reference_points=intermediate_reference_points,
+            hidden_states=all_hidden_states,
+            attentions=all_attns,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the Grounding DINO encoder-decoder model.
+    """
+)
+class MMGroundingDinoModelOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of vision hidden-states at the output of the last layer of the encoder of the model.
+    encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of text hidden-states at the output of the last layer of the encoder of the model.
+    encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+        layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+        output of each layer plus the initial embedding outputs.
+    encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+        of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+        each layer plus the initial embedding outputs.
+    encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+        Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+        sequence_length, sequence_length)`.
+        Attention weights after the attention softmax, used to compute the weighted average in the text-vision
+        attention, vision-text attention, text-enhancer (self-attention) and multi-scale deformable attention
+        heads.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
+        region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
+        background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Logits of top `config.num_queries` scoring bounding boxes in the first stage.
+    encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None  # presumably the decoder's `hidden_states` — confirm
+    decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None  # presumably the decoder's `attentions` — confirm
+    encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None
+    encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    encoder_logits: Optional[torch.FloatTensor] = None
+    encoder_pred_boxes: Optional[torch.FloatTensor] = None
+
+
+class MMGroundingDinoSinePositionEmbedding(nn.Module):
+    """
+    Fixed sine/cosine position embedding for 2D feature maps, in the spirit of the encoding from the Attention is
+    all you need paper: normalized row/column positions are encoded separately and concatenated channel-wise.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.scale = 2 * math.pi  # multiplier applied to the normalized positions
+        self.temperature = config.positional_embedding_temperature
+        self.embedding_dim = config.d_model // 2  # channels per axis; the y and x halves are concatenated
+
+    def forward(self, pixel_values, pixel_mask):
+        # `pixel_values` only supplies the device; positions are cumulative sums over `pixel_mask`.
+        tiny = 1e-6
+        rows = pixel_mask.cumsum(1, dtype=torch.float32)
+        cols = pixel_mask.cumsum(2, dtype=torch.float32)
+        rows = rows / (rows[:, -1:, :] + tiny) * self.scale
+        cols = cols / (cols[:, :, -1:] + tiny) * self.scale
+
+        channel = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device)
+        freqs = self.temperature ** (2 * torch.div(channel, 2, rounding_mode="floor") / self.embedding_dim)
+        angles_x = cols[:, :, :, None] / freqs
+        angles_y = rows[:, :, :, None] / freqs
+        # Interleave sin of the even-indexed channels with cos of the odd-indexed ones, per axis.
+        enc_x = torch.stack((angles_x[:, :, :, 0::2].sin(), angles_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        enc_y = torch.stack((angles_y[:, :, :, 0::2].sin(), angles_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        return torch.cat((enc_y, enc_x), dim=3).permute(0, 3, 1, 2)
+
+
+def build_position_encoding(config):
+    """Instantiate the position-embedding module selected by `config.position_embedding_type`."""
+    kind = config.position_embedding_type
+    if kind == "sine":
+        return MMGroundingDinoSinePositionEmbedding(config)
+    if kind == "learned":
+        return MMGroundingDinoLearnedPositionEmbedding(config)
+    # Only "sine" and "learned" are handled; anything else is rejected.
+    raise ValueError(f"Not supported {config.position_embedding_type}")
+
+
+# Token ids for [CLS], [SEP], "." and "?"; used below as phrase boundaries when building the text masks.
+SPECIAL_TOKENS = [101, 102, 1012, 1029]
+
+
+def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> tuple[Tensor, Tensor]:
+    """Generate attention mask between each pair of special tokens and positional ids.
+    Each row is handled independently, so the result for a sequence does not depend on its batch neighbours.
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+    Returns:
+        `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids:
+        - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`)
+        - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`)
+    """
+    batch_size, num_token = input_ids.shape
+    device = input_ids.device
+    # special_tokens_mask: (batch_size, num_token), True where the token is one of SPECIAL_TOKENS
+    special_tokens_mask = torch.zeros((batch_size, num_token), device=device).bool()
+    for special_token in SPECIAL_TOKENS:
+        special_tokens_mask = torch.logical_or(special_tokens_mask, input_ids == special_token)
+
+    # Tokens attend to themselves by default; phrase blocks are switched on below.
+    attention_mask = torch.eye(num_token, device=device).bool().unsqueeze(0).repeat(batch_size, 1, 1)
+    position_ids = torch.zeros((batch_size, num_token), device=device)
+
+    # (row, col) pairs of the special tokens in row-major order, pulled to Python ints in one transfer
+    previous_row, previous_col = -1, 0
+    for row, col in torch.nonzero(special_tokens_mask).tolist():
+        if row != previous_row:
+            # New sequence: restart the phrase boundary instead of inheriting it from the previous row.
+            previous_row, previous_col = row, 0
+        if col != 0 and col != num_token - 1:
+            # The phrase spans the tokens after the previous special token up to and including this one.
+            phrase = slice(previous_col + 1, col + 1)
+            attention_mask[row, phrase, phrase] = True
+            position_ids[row, phrase] = torch.arange(0, col - previous_col, device=device)
+        # Special tokens at either end of the sequence keep the identity mask and position 0.
+        previous_col = col
+
+    return attention_mask, position_ids.to(torch.long)
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw
+    hidden-states without any specific head on top.
+    """
+)
+class MMGroundingDinoModel(MMGroundingDinoPreTrainedModel):
+    def __init__(self, config: MMGroundingDinoConfig):
+        super().__init__(config)
+        hidden = config.d_model
+        # Vision backbone, wrapped together with its position-embedding module.
+        backbone = MMGroundingDinoConvEncoder(config)
+        self.backbone = MMGroundingDinoConvModel(backbone, build_position_encoding(config))
+
+        # One 1x1 projection per backbone feature map, bringing every level to `d_model` channels.
+        channel_sizes = backbone.intermediate_channel_sizes
+        projections = []
+        for in_channels in channel_sizes:
+            projections.append(
+                nn.Sequential(nn.Conv2d(in_channels, hidden, kernel_size=1), nn.GroupNorm(32, hidden))
+            )
+        # Extra levels use stride-2 3x3 convs: the first reads the last backbone channels, later ones `d_model`.
+        for _ in range(config.num_feature_levels - len(channel_sizes)):
+            projections.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channels, hidden, kernel_size=3, stride=2, padding=1),
+                    nn.GroupNorm(32, hidden),
+                )
+            )
+            in_channels = hidden
+        self.input_proj_vision = nn.ModuleList(projections)
+
+        # Text backbone and the linear map from its hidden size to `d_model`.
+        text_config = config.text_config
+        self.text_backbone = AutoModel.from_config(text_config, add_pooling_layer=False)
+        self.text_projection = nn.Linear(text_config.hidden_size, hidden)
+
+        # Learned query embeddings are only created for these configurations.
+        if config.embedding_init_target or not config.two_stage:
+            self.query_position_embeddings = nn.Embedding(config.num_queries, hidden)
+
+        self.encoder = MMGroundingDinoEncoder(config)
+        self.decoder = MMGroundingDinoDecoder(config)
+
+        # Allocated uninitialized here; presumably filled by the weight-init hook run from post_init() — confirm.
+        self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, hidden))
+
+        # Heads applied to the encoder output.
+        self.enc_output = nn.Linear(hidden, hidden)
+        self.enc_output_norm = nn.LayerNorm(hidden, config.layer_norm_eps)
+        self.encoder_output_bbox_embed = MMGroundingDinoMLPPredictionHead(
+            input_dim=hidden, hidden_dim=hidden, output_dim=4, num_layers=3
+        )
+        self.encoder_output_class_embed = MMGroundingDinoContrastiveEmbedding(config)
+
+        self.post_init()
+
+    def freeze_backbone(self):  # turn off gradients for every parameter of the backbone's conv-encoder model
+        for _, weight in self.backbone.conv_encoder.model.named_parameters():
+            weight.requires_grad_(False)
+
+    def unfreeze_backbone(self):  # turn gradients back on for every parameter of the backbone's conv-encoder model
+        for _, weight in self.backbone.conv_encoder.model.named_parameters():
+            weight.requires_grad_(True)
+
+    def get_valid_ratio(self, mask):
+        """
+        Return, per batch item, the `(width, height)` fractions of `mask` that are set, measured along the
+        first row and the first column of the `(batch, height, width)` mask.
+        """
+        _, height, width = mask.shape
+        # Column 0 counts the set rows, row 0 counts the set columns.
+        ratio_h = torch.sum(mask[:, :, 0], 1).float() / height
+        ratio_w = torch.sum(mask[:, 0, :], 1).float() / width
+        return torch.stack([ratio_w, ratio_h], -1)
+
+    def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+        """Build one anchor-box proposal per encoder position, plus the matching object-query features.
+
+        Args:
+            enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder.
+            padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`.
+            spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps.
+
+        Returns:
+            `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
+                - object_query (Tensor[batch_size, sequence_length, hidden_size]): `enc_output` with padded and
+                  invalid positions zeroed, passed through `enc_output` and `enc_output_norm`.
+                - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
+                  sigmoid; padded or out-of-range entries are set to `inf`.
+        """
+        device = enc_output.device
+        batch_size = enc_output.shape[0]
+        per_level = []
+        offset = 0
+        for level, (height, width) in enumerate(spatial_shapes):
+            level_size = height * width
+            level_mask = padding_mask[:, offset : (offset + level_size)].view(batch_size, height, width, 1)
+            valid_height = torch.sum(~level_mask[:, :, 0, 0], 1)
+            valid_width = torch.sum(~level_mask[:, 0, :, 0], 1)
+
+            grid_y, grid_x = meshgrid(
+                torch.linspace(0, height - 1, height, dtype=torch.float32, device=device),
+                torch.linspace(0, width - 1, width, dtype=torch.float32, device=device),
+                indexing="ij",
+            )
+            # Cell centres as (x, y), divided by the unpadded extent of each image.
+            centers = torch.stack([grid_x, grid_y], -1)
+            scale = torch.stack([valid_width, valid_height], 1).view(batch_size, 1, 1, 2)
+            centers = (centers.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
+            # Box size doubles with every feature level, starting at 0.05.
+            sizes = torch.ones_like(centers) * 0.05 * (2.0**level)
+            per_level.append(torch.cat((centers, sizes), -1).view(batch_size, -1, 4))
+            offset += level_size
+
+        output_proposals = torch.cat(per_level, 1)
+        in_range = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))  # inverse sigmoid
+        output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
+        output_proposals = output_proposals.masked_fill(~in_range, float("inf"))
+
+        # assign each pixel as an object query
+        object_query = enc_output.masked_fill(padding_mask.unsqueeze(-1), float(0))
+        object_query = object_query.masked_fill(~in_range, float(0))
+        object_query = self.enc_output_norm(self.enc_output(object_query))
+        return object_query, output_proposals
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Tensor,
+        input_ids: Tensor,
+        token_type_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        pixel_mask: Optional[Tensor] = None,
+        encoder_outputs=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details.
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token
+
+            [What are token type IDs?](../glossary#token-type-ids)
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "a cat."
+
+        >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
+        >>> model = AutoModel.from_pretrained("IDEA-Research/grounding-dino-tiny")
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 900, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        text_token_mask = attention_mask.bool()  # boolean view of attention_mask, used under this name below
+
+        max_text_len = self.config.max_text_len
+        if text_self_attention_masks.shape[1] > max_text_len:  # clip every text input to max_text_len tokens
+            text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
+            position_ids = position_ids[:, :max_text_len]
+            input_ids = input_ids[:, :max_text_len]
+            token_type_ids = token_type_ids[:, :max_text_len]
+            text_token_mask = text_token_mask[:, :max_text_len]
+
+        # Extract text features from the text backbone, then project them with `text_projection`
+        text_outputs = self.text_backbone(
+            input_ids, text_self_attention_masks, token_type_ids, position_ids, return_dict=return_dict
+        )
+        text_features = text_outputs.last_hidden_state if return_dict else text_outputs[0]
+        text_features = self.text_projection(text_features)
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:  # no mask supplied: fall back to an all-ones mask
+            pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
+
+        # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper)
+        # First, send pixel_values + pixel_mask through the backbone to obtain the features,
+        # which is a list of (feature_map, mask) tuples
+        vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+        # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        feature_maps = []
+        masks = []
+        for level, (source, mask) in enumerate(vision_features):
+            feature_maps.append(self.input_proj_vision[level](source))
+            masks.append(mask)
+
+        # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+        if self.config.num_feature_levels > len(feature_maps):
+            _len_sources = len(feature_maps)
+            for level in range(_len_sources, self.config.num_feature_levels):
+                if level == _len_sources:
+                    source = self.input_proj_vision[level](vision_features[-1][0])
+                else:
+                    source = self.input_proj_vision[level](feature_maps[-1])
+                mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0]
+                pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
+                feature_maps.append(source)
+                masks.append(mask)
+                position_embeddings_list.append(pos_l)
+
+        # Create queries (left as None unless `embedding_init_target` or `two_stage` is enabled)
+        query_embeds = None
+        if self.config.embedding_init_target or self.config.two_stage:
+            query_embeds = self.query_position_embeddings.weight
+
+        # Prepare encoder inputs: flatten each level to (batch, height * width, channels), then concatenate levels
+        source_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes_list = []
+        for level, (source, mask, pos_embed) in enumerate(zip(feature_maps, masks, position_embeddings_list)):
+            batch_size, num_channels, height, width = source.shape  # rebinds the names unpacked from pixel_values
+            spatial_shape = (height, width)
+            spatial_shapes_list.append(spatial_shape)
+            source = source.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            source_flatten.append(source)
+            mask_flatten.append(mask)
+        source_flatten = torch.cat(source_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=source_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        valid_ratios = valid_ratios.float()
+
+        # Next, send source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder
+        # Also provide spatial_shapes, level_start_index and valid_ratios
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                vision_features=source_flatten,
+                vision_attention_mask=~mask_flatten,
+                vision_position_embedding=lvl_pos_embed_flatten,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                text_features=text_features,
+                text_attention_mask=~text_token_mask,
+                text_position_embedding=None,
+                text_self_attention_masks=~text_self_attention_masks,
+                text_position_ids=position_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a MMGroundingDinoEncoderOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, MMGroundingDinoEncoderOutput):
+            encoder_outputs = MMGroundingDinoEncoderOutput(
+                last_hidden_state_vision=encoder_outputs[0],
+                last_hidden_state_text=encoder_outputs[1],
+                vision_hidden_states=encoder_outputs[2] if output_hidden_states else None,
+                text_hidden_states=encoder_outputs[3] if output_hidden_states else None,
+                attentions=encoder_outputs[-1] if output_attentions else None,
+            )
+
+        # Then, prepare decoder inputs (query embeddings `target` and `reference_points`)
+        topk_proposals = None
+        enc_outputs_class = None
+        enc_outputs_coord_logits = None
+        encoder_logits = None
+        encoder_pred_boxes = None
+        if self.config.two_stage:
+            object_query_embedding, output_proposals = self.generate_encoder_output_proposals(
+                encoder_outputs[0], ~mask_flatten, spatial_shapes
+            )
+
+            # hack implementation as in two-stage Deformable DETR
+            # apply a detection head to each pixel (A.4 in paper)
+            # linear projection for bounding box binary classification (i.e. foreground and background)
+            enc_outputs_class = self.encoder_output_class_embed(
+                object_query_embedding, encoder_outputs[1], text_token_mask
+            )
+            # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
+            delta_bbox = self.encoder_output_bbox_embed(object_query_embedding)
+            enc_outputs_coord_logits = delta_bbox + output_proposals
+
+            # only keep top scoring `config.num_queries` proposals
+            topk = self.config.num_queries
+            topk_logits = enc_outputs_class.max(-1)[0]
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]
+            topk_coords_logits = torch.gather(
+                enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            )
+
+            topk_coords_logits = topk_coords_logits.detach()
+            reference_points = topk_coords_logits.sigmoid()
+            init_reference_points = reference_points
+            if query_embeds is not None:
+                target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
+            else:
+                target = torch.gather(
+                    object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model)
+                ).detach()
+
+            # Set intermediate topk proposals (coords and class) for loss computation
+            encoder_pred_boxes = reference_points
+            encoder_logits = self.encoder_output_class_embed(target, text_features, text_token_mask)
+        else:
+            target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1)  # NOTE(review): None unless embedding_init_target
+            reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid()
+            init_reference_points = reference_points
+
+        decoder_outputs = self.decoder(
+            inputs_embeds=target,
+            vision_encoder_hidden_states=encoder_outputs[0],
+            vision_encoder_attention_mask=mask_flatten,  # NOTE(review): un-inverted, unlike the encoder call - confirm
+            text_encoder_hidden_states=encoder_outputs[1],
+            text_encoder_attention_mask=~text_token_mask,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            self_attn_mask=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:  # tuple output; first-stage entries that are None are dropped
+            enc_outputs = tuple(
+                value
+                for value in [
+                    enc_outputs_class,
+                    enc_outputs_coord_logits,
+                    encoder_logits,
+                    encoder_pred_boxes,
+                ]
+                if value is not None
+            )
+            tuple_outputs = (
+                (decoder_outputs[0], init_reference_points) + decoder_outputs[1:] + encoder_outputs + enc_outputs
+            )
+
+            return tuple_outputs
+
+        return MMGroundingDinoModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            init_reference_points=init_reference_points,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision,
+            encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text,
+            encoder_vision_hidden_states=encoder_outputs.vision_hidden_states,
+            encoder_text_hidden_states=encoder_outputs.text_hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord_logits=enc_outputs_coord_logits,
+            encoder_logits=encoder_logits,
+            encoder_pred_boxes=encoder_pred_boxes,
+        )
+
+
+class MMGroundingDinoMLPPredictionHead(nn.Module):
+    """
+    Plain feed-forward head: a stack of linear layers with a ReLU after each one except the last. It is used to
+    regress the normalized center coordinates, height and width of a bounding box w.r.t. an image.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        widths = [input_dim, *([hidden_dim] * (num_layers - 1)), output_dim]
+        self.layers = nn.ModuleList(nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:]))
+
+    def forward(self, x):
+        for depth, layer in enumerate(self.layers, start=1):
+            x = layer(x)
+            x = nn.functional.relu(x) if depth < self.num_layers else x
+        return x
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`MMGroundingDinoForObjectDetection`].
+    """
+)
+class MMGroundingDinoObjectDetectionOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized box coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~MMGroundingDinoProcessor.post_process_grounded_object_detection`] to retrieve the
+        unnormalized bounding boxes.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+        Sequence of hidden-states at the output of the last layer of the encoder of the model.
+    encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+        layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+        output of each layer plus the initial embedding outputs.
+    encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+        of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+        each layer plus the initial embedding outputs.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
+        region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
+        background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    encoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+        Logits of top `config.num_queries` scoring bounding boxes in the first stage.
+    encoder_pred_boxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+        Coordinates of top `config.num_queries` scoring bounding boxes in the first stage.
+    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Encoded candidate labels sequence. Used in processor to post process object detection result.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None
+    encoder_vision_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_text_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    encoder_logits: Optional[torch.FloatTensor] = None
+    encoder_pred_boxes: Optional[torch.FloatTensor] = None
+    input_ids: Optional[torch.LongTensor] = None
+
+
+def build_label_maps(logits: torch.FloatTensor, input_ids: torch.LongTensor) -> tuple[torch.FloatTensor]:
+    """
+    Build, for every sample in the batch, a binary map telling which prompt tokens belong to which label.
+
+    A label is a maximal run of non-delimiter tokens; delimiters are the ids in `SPECIAL_TOKENS` plus `0`
+    (the [PAD] id). The number of labels is therefore read off the prompt and may differ between samples.
+
+    Args:
+        logits (`torch.Tensor`):
+            Model logits. Only the size of the last dimension is used: it is the width every map is padded to.
+
+        input_ids (`torch.Tensor` of shape `(batch_size, seq_length)`):
+            Token ids of the prompt, special tokens included. For the prompt "fish. shark." this might be
+            `[101, 3869, 1012, 11420, 1012, 102]`.
+
+    Returns:
+        `tuple(torch.Tensor)`: One tensor per sample, of shape `(num_labels, logits.shape[-1])`. Row `i` holds `1`
+        at the positions of the tokens forming the `i`-th label and `0` everywhere else.
+        The tuple has one entry per row of `input_ids`, in batch order.
+
+    Example:
+        With `input_ids = [101, 3869, 1012, 11420, 1012, 102]` and `101`, `1012`, `102` all being delimiters, the
+        per-token groups are `[0, 1, 0, 2, 0, 0]`: token `3869` forms label 1 and token `11420` forms label 2.
+        The resulting map has two rows, with their single `1` at position 1 and position 3 respectively.
+
+    Note:
+        - `SPECIAL_TOKENS` must be defined at module level as a list of delimiter token ids.
+        - Tokens located before the first delimiter receive group 0 and are thus not part of any label.
+    """
+    target_width = logits.shape[-1]
+    # [PAD] (id 0) separates labels as well, so it joins the special tokens
+    separators = torch.tensor(SPECIAL_TOKENS + [0], device=input_ids.device)
+
+    is_separator = torch.isin(input_ids, separators)
+    # Running count of separators, zeroed on the separators themselves: all tokens of one label share one
+    # positive integer and every other position is 0
+    group_ids = torch.cumsum(is_separator, dim=1) * (~is_separator).to(torch.int32)
+
+    # Samples are handled one at a time because each may contain a different number of labels
+    per_sample_maps = []
+    for sample_groups in group_ids:
+        # Distinct group ids as a column vector, with the leading entry (0, the non-label marker) dropped
+        distinct_groups = torch.unique(sample_groups)
+        label_ids = distinct_groups[1:, None]
+
+        # Broadcasting `(1, seq_len) == (num_labels, 1)` yields one one-hot row per label
+        one_hot = torch.where(sample_groups.unsqueeze(0) == label_ids, 1, 0)
+
+        # Pad on the right with zeros up to `target_width`
+        missing = target_width - one_hot.shape[1]
+        padded = F.pad(one_hot, (0, missing), value=0)
+        per_sample_maps.append(padded)
+
+    return tuple(per_sample_maps)
+
+
+def build_text_mask(logits, attention_mask):
+    """
+    Broadcast the 2-D `attention_mask` across the middle axis of the 3-D `logits` and return a bool mask shaped
+    like `logits`. Trailing columns not covered by the mask are False; an over-long mask is cropped to fit.
+    """
+    seq_len = min(attention_mask.shape[1], logits.shape[-1])  # crop rather than fail on over-long text
+    text_mask = torch.zeros_like(logits, dtype=torch.bool)
+    text_mask[:, :, :seq_len] = attention_mask[:, None, :seq_len].bool()
+    return text_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top,
+    for tasks such as COCO detection.
+    """
+)
+class MMGroundingDinoForObjectDetection(MMGroundingDinoPreTrainedModel):
+    _tied_weights_keys = [
+        r"bbox_embed\.[1-9]\d*",
+        r"model\.decoder\.bbox_embed\.[0-9]\d*",
+        r"class_embed\.[1-9]\d*",
+        r"model\.decoder\.class_embed\.[0-9]\d*",
+    ]
+
+    def __init__(self, config: MMGroundingDinoConfig):
+        """
+        Build the base model plus one classification head and one box-regression head per decoder layer.
+
+        Args:
+            config (`MMGroundingDinoConfig`): Reads `decoder_layers` (number of heads) and `d_model` (head width).
+        """
+        super().__init__(config)
+        self.model = MMGroundingDinoModel(config)
+
+        # One `MMGroundingDinoContrastiveEmbedding` and one 3-layer MLP (4 outputs) per decoder layer
+        num_heads, width = config.decoder_layers, config.d_model
+        self.class_embed = nn.ModuleList(MMGroundingDinoContrastiveEmbedding(config) for _ in range(num_heads))
+        self.bbox_embed = nn.ModuleList(
+            MMGroundingDinoMLPPredictionHead(input_dim=width, hidden_dim=width, output_dim=4, num_layers=3)
+            for _ in range(num_heads)
+        )
+
+        # Hand the same head lists to the decoder: `bbox_embed` for box-refinement, `class_embed` for two-stage
+        decoder = self.model.decoder
+        decoder.bbox_embed = self.bbox_embed
+        decoder.class_embed = self.class_embed
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        input_ids: torch.LongTensor,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        pixel_mask: Optional[torch.BoolTensor] = None,
+        encoder_outputs: Optional[Union[MMGroundingDinoEncoderOutput, tuple]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[list[dict[str, Union[torch.LongTensor, torch.FloatTensor]]]] = None,
+    ):
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`BertTokenizer.__call__`] for details.
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Examples:
+
+        ```python
+        >>> import requests
+
+        >>> import torch
+        >>> from PIL import Image
+        >>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+        >>> model_id = "IDEA-Research/grounding-dino-tiny"
+        >>> device = "cuda"
+
+        >>> processor = AutoProcessor.from_pretrained(model_id)
+        >>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+
+        >>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(image_url, stream=True).raw)
+        >>> # Check for cats and remote controls
+        >>> text_labels = [["a cat", "a remote control"]]
+
+        >>> inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> results = processor.post_process_grounded_object_detection(
+        ...     outputs,
+        ...     threshold=0.4,
+        ...     text_threshold=0.3,
+        ...     target_sizes=[(image.height, image.width)]
+        ... )
+        >>> # Retrieve the first image result
+        >>> result = results[0]
+        >>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
+        ...     box = [round(x, 2) for x in box.tolist()]
+        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
+        Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
+        Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
+        Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        # First, sent images through Grounding DINO base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            pixel_mask=pixel_mask,
+            encoder_outputs=encoder_outputs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0)
+        enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx]
+        hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
+        init_reference_points = outputs.init_reference_points if return_dict else outputs[1]
+        inter_references_points = outputs.intermediate_reference_points if return_dict else outputs[3]
+
+        # class logits + predicted bounding boxes
+        outputs_classes = []
+        outputs_coords = []
+
+        # hidden_states are of shape (batch_size, num_stages, height, width)
+        # predict class and bounding box deltas for each stage
+        num_levels = hidden_states.shape[1]
+        for level in range(num_levels):
+            if level == 0:
+                reference = init_reference_points
+            else:
+                reference = inter_references_points[:, level - 1]
+            reference = torch.special.logit(reference, eps=1e-5)
+            outputs_class = self.class_embed[level](
+                vision_hidden_state=hidden_states[:, level],
+                text_hidden_state=enc_text_hidden_state,
+                text_token_mask=attention_mask.bool(),
+            )
+            delta_bbox = self.bbox_embed[level](hidden_states[:, level])
+
+            reference_coordinates = reference.shape[-1]
+            if reference_coordinates == 4:
+                outputs_coord_logits = delta_bbox + reference
+            elif reference_coordinates == 2:
+                delta_bbox[..., :2] += reference
+                outputs_coord_logits = delta_bbox
+            else:
+                raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
+            outputs_coord = outputs_coord_logits.sigmoid()
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+        outputs_class = torch.stack(outputs_classes)
+        outputs_coord = torch.stack(outputs_coords)
+
+        logits = outputs_class[-1]
+        pred_boxes = outputs_coord[-1]
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            label_maps = build_label_maps(logits, input_ids)
+            text_mask = build_text_mask(logits, attention_mask)
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits,
+                labels,
+                self.device,
+                pred_boxes,
+                self.config,
+                label_maps,
+                text_mask,
+                outputs_class=outputs_class,
+                outputs_coord=outputs_coord,
+                encoder_logits=outputs[-2],
+                encoder_pred_boxes=outputs[-1],
+            )
+
+        if not return_dict:
+            auxiliary_outputs = auxiliary_outputs if auxiliary_outputs is not None else []
+            output = [loss, loss_dict, logits, pred_boxes, *auxiliary_outputs, *outputs, input_ids]
+            output = tuple(out for out in output if out is not None)
+            return output
+
+        dict_outputs = MMGroundingDinoObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            last_hidden_state=outputs.last_hidden_state,
+            auxiliary_outputs=auxiliary_outputs,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision,
+            encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text,
+            encoder_vision_hidden_states=outputs.encoder_vision_hidden_states,
+            encoder_text_hidden_states=outputs.encoder_text_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            intermediate_hidden_states=outputs.intermediate_hidden_states,
+            intermediate_reference_points=outputs.intermediate_reference_points,
+            init_reference_points=outputs.init_reference_points,
+            enc_outputs_class=outputs.enc_outputs_class,
+            enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+            encoder_logits=outputs.encoder_logits,
+            encoder_pred_boxes=outputs.encoder_pred_boxes,
+            input_ids=input_ids,
+        )
+
+        return dict_outputs
+
+
+__all__ = ["MMGroundingDinoForObjectDetection", "MMGroundingDinoModel", "MMGroundingDinoPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/feature_extraction_mobilenet_v1.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/feature_extraction_mobilenet_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..02a5401bc145996d1126641ee656180a48c92e20
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/feature_extraction_mobilenet_v1.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for MobileNetV1."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_mobilenet_v1 import MobileNetV1ImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))  # presumably gates this class on the "vision" backend -- confirm against `requires`
+class MobileNetV1FeatureExtractor(MobileNetV1ImageProcessor):  # deprecated name: adds only a warning on top of MobileNetV1ImageProcessor
+    def __init__(self, *args, **kwargs) -> None:  # accepts any arguments and forwards them unchanged to the parent
+        warnings.warn(  # called on each construction, before the parent constructor runs
+            "The class MobileNetV1FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+            " Please use MobileNetV1ImageProcessor instead.",
+            FutureWarning,  # warning category passed to warnings.warn
+        )
+        super().__init__(*args, **kwargs)  # all real initialisation is done by MobileNetV1ImageProcessor
+
+
+__all__ = ["MobileNetV1FeatureExtractor"]  # the only name exported by a star-import of this module
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3e875620e62736ec5de99e5a8e4a434301e0a5b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/configuration_modernbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/configuration_modernbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37e8f2a42e1a566560ae4dcd5a450ef2fee546e7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/configuration_modernbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modeling_modernbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modeling_modernbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee064d98823e04301c2bbcbdd77403ff5c28c7fd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modeling_modernbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modular_modernbert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modular_modernbert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87da02173483c8ee532b4bbea046496e8358b273
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert/__pycache__/modular_modernbert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4610b4f7b15c3cda54da252455dedcadd78f4a5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/configuration_modernbert_decoder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/configuration_modernbert_decoder.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ff1852b2826e8063f8621b806a122486585646a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/configuration_modernbert_decoder.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modeling_modernbert_decoder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modeling_modernbert_decoder.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a686750bba00a176f27233d60003bb45ad7a7e6d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modeling_modernbert_decoder.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modular_modernbert_decoder.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modular_modernbert_decoder.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dad8a22d0f5130a56ec3f159c878dfea1b18958b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/modernbert_decoder/__pycache__/modular_modernbert_decoder.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c04edefd66be0b4a7133422250b6c94c6f022c0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc491dc6be535c47a9ad997fe80f05fd7afc94b4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eda93241e1a909f870360ceae157257a269eb82f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/configuration_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/configuration_mpnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d92e97405459b602b2520ce4a88624fb98a770c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/configuration_mpnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_mpnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37b241fd135664f1f5d6f8db6e99bebddc5de0ec
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_mpnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_tf_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_tf_mpnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76449ce4fad7da5e271dc6d25467c395b0a23557
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/modeling_tf_mpnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28e472139bdf864cf015cd7b57599f099583a3e1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33e7be0d01be71c4a4bdd4b4ddd4340f77e98726
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mpnet/__pycache__/tokenization_mpnet_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bbf46adf59857321334a2d1a3685d46278ea848
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/configuration_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/configuration_mt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9319d516d82c67f1f002608532cdc7ef55e47a64
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/configuration_mt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_flax_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_flax_mt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06f8b02301a5308eaf82cd6e3cf015f0c19407e9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_flax_mt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_tf_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_tf_mt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0326dae38a04abba5d8c7b721f4d862aeb302282
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/modeling_tf_mt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e64302a535676ef5e11d047e7ef7d95d23e4d8bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85213459dacef1444c1efb3bacbcafe788bab91a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mt5/__pycache__/tokenization_mt5_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4334a8cefc8e5a8f8218d776b809d9c93911b197
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/configuration_mvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/configuration_mvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b673d0550e4ac3555b3b8d1d7f7236c2a7ad278b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/configuration_mvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/modeling_mvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/modeling_mvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94c86a256daae3fbf273af5f29bc314b1fc682af
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/modeling_mvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f3b9f599acb760dd4784b02af6c485390f4f88c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bc41f25bd73128bcfc4079c03e3fdd95cd62fcd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mvp/__pycache__/tokenization_mvp_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a56efc31124f42723bbdedd6117039ee08f5746a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/configuration_nemotron.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/configuration_nemotron.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9dd4c1e38c53cf8ef2c47a31449975d41133d48
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/configuration_nemotron.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/modeling_nemotron.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/modeling_nemotron.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0952cbe50dac0ef9f9cf890c44fffb094af3b518
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nemotron/__pycache__/modeling_nemotron.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d98c0304a17be67d55d90e5073efbe67a068f967
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/configuration_nllb_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/configuration_nllb_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b42ee0744425d09ab189eeb0f3998b129f33bddb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/configuration_nllb_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/modeling_nllb_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/modeling_nllb_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78b4da8000d3315fd55b3ed1842ab511c46e7356
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nllb_moe/__pycache__/modeling_nllb_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ceeb9895278cab27cbc9ea93428dc5cda305a362
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/configuration_olmo2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/configuration_olmo2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9e9bbfd57bf8df1f26d43d030572696736dfe67
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/configuration_olmo2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/modular_olmo2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/modular_olmo2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6788c0fbde0d0dd4654df71bbedc13ec28ebf838
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo2/__pycache__/modular_olmo2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3fd80c2bcc6b268cd47a7afe321364ce3ed1aa5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # true only for static type checkers: star-import every submodule so its names are visible to them
+    from .configuration_owlvit import *
+    from .feature_extraction_owlvit import *
+    from .image_processing_owlvit import *
+    from .image_processing_owlvit_fast import *
+    from .modeling_owlvit import *
+    from .processing_owlvit import *
+else:  # at runtime none of the submodules above are imported here
+    import sys
+
+    _file = globals()["__file__"]  # path of this __init__.py, passed to both helpers below
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)  # replaces this package's sys.modules entry; presumably defers submodule imports until attribute access -- confirm in _LazyModule
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/configuration_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/configuration_owlvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4873ff4a08b3f53cfb4dc8cd63ba01e7a2e96c1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/configuration_owlvit.py
@@ -0,0 +1,336 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OWL-ViT model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+
+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+    from ...utils import TensorType
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class OwlViTTextConfig(PretrainedConfig):
+    r"""
+    Hyper-parameters of the OWL-ViT text encoder; this is the configuration object an [`OwlViTTextModel`] is built
+    from. Constructing it without arguments gives settings similar to the text tower of
+    [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32).
+
+    The class derives from [`PretrainedConfig`], so the generic options documented there apply as well; keyword
+    arguments not listed below are forwarded to it unchanged.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 49408):
+            Size of the token vocabulary, i.e. how many different ids may appear in the `inputs_ids` given to
+            [`OwlViTTextModel`].
+        hidden_size (`int`, *optional*, defaults to 512):
+            Width of the encoder layers and of the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 2048):
+            Width of the feed-forward ("intermediate") layer inside each Transformer encoder block.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            How many hidden layers the Transformer encoder stacks.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            How many attention heads each attention layer uses.
+        max_position_embeddings (`int`, *optional*, defaults to 16):
+            Longest sequence length the model may be used with. It is usually set generously (e.g., 512, 1024 or
+            2048).
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+            Non-linearity of the encoder and pooler, given as a callable or by name. Supported names are `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"`.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            Epsilon of the layer-normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout ratio applied to the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated_normal_initializer used for all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            Multiplier applied when initializing weight matrices. Leave it at 1; it exists for internal
+            initialization testing.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Id of the padding token in the input sequences.
+        bos_token_id (`int`, *optional*, defaults to 49406):
+            Id of the beginning-of-sequence token in the input sequences.
+        eos_token_id (`int`, *optional*, defaults to 49407):
+            Id of the end-of-sequence token in the input sequences.
+
+    Example:
+
+    ```python
+    >>> from transformers import OwlViTTextConfig, OwlViTTextModel
+
+    >>> # Build a text configuration in the google/owlvit-base-patch32 style
+    >>> configuration = OwlViTTextConfig()
+
+    >>> # Build a (randomly initialized) text model from that configuration
+    >>> model = OwlViTTextModel(configuration)
+
+    >>> # Read the configuration back from the model
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "owlvit_text_model"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=49408,
+        hidden_size=512,
+        intermediate_size=2048,
+        num_hidden_layers=12,
+        num_attention_heads=8,
+        max_position_embeddings=16,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        pad_token_id=0,
+        bos_token_id=49406,
+        eos_token_id=49407,
+        **kwargs,
+    ):
+        # The special-token ids are handed to the base class instead of being assigned here.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+        # Vocabulary and Transformer geometry.
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+
+        # Activation, normalization and regularization.
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+
+        # Weight initialization.
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+
+
+class OwlViTVisionConfig(PretrainedConfig):
+    r"""
+    Hyper-parameters of the OWL-ViT image encoder; this is the configuration object an [`OwlViTVisionModel`] is
+    built from. Constructing it without arguments gives settings similar to the vision tower of
+    [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32).
+
+    The class derives from [`PretrainedConfig`], so the generic options documented there apply as well; keyword
+    arguments not listed below are forwarded to it unchanged.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Width of the encoder layers and of the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Width of the feed-forward ("intermediate") layer inside each Transformer encoder block.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            How many hidden layers the Transformer encoder stacks.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            How many attention heads each attention layer uses.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels of the input images.
+        image_size (`int`, *optional*, defaults to 768):
+            Size (resolution) of each input image.
+        patch_size (`int`, *optional*, defaults to 32):
+            Size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
+            Non-linearity of the encoder and pooler, given as a callable or by name. Supported names are `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"`.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            Epsilon of the layer-normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            Dropout ratio applied to the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            Standard deviation of the truncated_normal_initializer used for all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1.0):
+            Multiplier applied when initializing weight matrices. Leave it at 1; it exists for internal
+            initialization testing.
+
+    Example:
+
+    ```python
+    >>> from transformers import OwlViTVisionConfig, OwlViTVisionModel
+
+    >>> # Build a vision configuration in the google/owlvit-base-patch32 style
+    >>> configuration = OwlViTVisionConfig()
+
+    >>> # Build a (randomly initialized) vision model from that configuration
+    >>> model = OwlViTVisionModel(configuration)
+
+    >>> # Read the configuration back from the model
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "owlvit_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_channels=3,
+        image_size=768,
+        patch_size=32,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        **kwargs,
+    ):
+        # Everything not named in the signature is left to the base class.
+        super().__init__(**kwargs)
+
+        # Transformer geometry.
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # Input image layout.
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+
+        # Activation, normalization and regularization.
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.attention_dropout = attention_dropout
+
+        # Weight initialization.
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+
+
+class OwlViTConfig(PretrainedConfig):
+    r"""
+    [`OwlViTConfig`] is the configuration class to store the configuration of an [`OwlViTModel`]. It is used to
+    instantiate an OWL-ViT model according to the specified arguments, defining the text model and vision model
+    configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the OWL-ViT
+    [google/owlvit-base-patch32](https://huggingface.co/google/owlvit-base-patch32) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict` or [`OwlViTTextConfig`], *optional*):
+            Options used to initialize [`OwlViTTextConfig`], or an already-built [`OwlViTTextConfig`] which is then
+            stored as-is (not copied). `None` selects the default text configuration.
+        vision_config (`dict` or [`OwlViTVisionConfig`], *optional*):
+            Options used to initialize [`OwlViTVisionConfig`], or an already-built [`OwlViTVisionConfig`] which is
+            then stored as-is (not copied). `None` selects the default vision configuration.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original OWL-ViT
+            implementation.
+        return_dict (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return a dictionary. If `False`, returns a tuple.
+        kwargs (*optional*):
+            Dictionary of keyword arguments, forwarded to [`PretrainedConfig`].
+    """
+
+    model_type = "owlvit"
+    sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig}
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        projection_dim=512,
+        logit_scale_init_value=2.6592,
+        return_dict=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the OwlViTTextConfig with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. initializing the OwlViTVisionConfig with default values.")
+
+        # A sub-config that is already an instance is kept as-is; unpacking it with `**` would raise
+        # `TypeError` because config objects are not mappings. Anything else is treated as a mapping of
+        # constructor arguments, exactly as before.
+        if not isinstance(text_config, OwlViTTextConfig):
+            text_config = OwlViTTextConfig(**text_config)
+        if not isinstance(vision_config, OwlViTVisionConfig):
+            vision_config = OwlViTVisionConfig(**vision_config)
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+
+        self.projection_dim = projection_dim
+        self.logit_scale_init_value = logit_scale_init_value
+        self.return_dict = return_dict
+        # Pinned to 1.0 here regardless of any `initializer_factor` passed through `kwargs`.
+        self.initializer_factor = 1.0
+
+    @classmethod
+    def from_text_vision_configs(cls, text_config: dict, vision_config: dict, **kwargs):
+        r"""
+        Instantiate a [`OwlViTConfig`] (or a derived class) from owlvit text model configuration and owlvit vision
+        model configuration.
+
+        Args:
+            text_config (`dict` or [`OwlViTTextConfig`]):
+                Text-model configuration. A config object is converted with `to_dict()` before use.
+            vision_config (`dict` or [`OwlViTVisionConfig`]):
+                Vision-model configuration. A config object is converted with `to_dict()` before use.
+            kwargs (*optional*):
+                Forwarded to `from_dict`.
+
+        Returns:
+            [`OwlViTConfig`]: An instance of a configuration object
+        """
+        # `from_dict` is given plain dictionaries, so config objects are serialized first.
+        config_dict = {
+            key: value.to_dict() if isinstance(value, PretrainedConfig) else value
+            for key, value in (("text_config", text_config), ("vision_config", vision_config))
+        }
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class OwlViTOnnxConfig(OnnxConfig):
+    """ONNX export description for OWL-ViT: named inputs/outputs with their dynamic axes, tolerance and opset."""
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Ordered mapping from input name to its `{axis index: axis name}` table."""
+        axes_by_input = OrderedDict()
+        axes_by_input["input_ids"] = {0: "batch", 1: "sequence"}
+        axes_by_input["pixel_values"] = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
+        axes_by_input["attention_mask"] = {0: "batch", 1: "sequence"}
+        return axes_by_input
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        """Ordered mapping from output name to its axis table; only axis 0 (`"batch"`) is named."""
+        output_names = ("logits_per_image", "logits_per_text", "text_embeds", "image_embeds")
+        # A fresh dict per entry, so no two outputs share the same axis table object.
+        return OrderedDict((name, {0: "batch"}) for name in output_names)
+
+    @property
+    def atol_for_validation(self) -> float:
+        """Absolute tolerance reported for validation."""
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        framework: Optional["TensorType"] = None,
+    ) -> Mapping[str, Any]:
+        """
+        Build dummy inputs by calling the parent implementation twice, once with `processor.tokenizer` and once with
+        `processor.image_processor`, and merging the two results into one dict.
+
+        On a key collision the image-processor entry wins, because it is merged last. `seq_length` is only passed
+        to the tokenizer call.
+        """
+        text_part = super().generate_dummy_inputs(
+            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
+        )
+        image_part = super().generate_dummy_inputs(
+            processor.image_processor, batch_size=batch_size, framework=framework
+        )
+
+        combined = {}
+        combined.update(text_part)
+        combined.update(image_part)
+        return combined
+
+    @property
+    def default_onnx_opset(self) -> int:
+        """Default ONNX opset version reported for this model."""
+        return 14
+
+
+__all__ = ["OwlViTConfig", "OwlViTOnnxConfig", "OwlViTTextConfig", "OwlViTVisionConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/feature_extraction_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/feature_extraction_owlvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee3a8d0b145cc54e8ccebf7acc84538236fac644
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/feature_extraction_owlvit.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for OwlViT."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_owlvit import OwlViTImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class OwlViTFeatureExtractor(OwlViTImageProcessor):
+    """Deprecated name for [`OwlViTImageProcessor`]; it only adds a `FutureWarning` on construction."""
+
+    def __init__(self, *args, **kwargs) -> None:
+        # Warn first, then defer entirely to the image processor's constructor.
+        deprecation_notice = (
+            "The class OwlViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please"
+            " use OwlViTImageProcessor instead."
+        )
+        warnings.warn(deprecation_notice, FutureWarning)
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["OwlViTFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc9c6cfdeaa8aa0532539f03e3e8e1e1aa3d9619
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit.py
@@ -0,0 +1,625 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for OwlViT"""
+
+import warnings
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    center_crop,
+    center_to_corners_format,
+    rescale,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_torch_available, logging
+from ...utils.import_utils import requires
+
+
+if TYPE_CHECKING:
+    from .modeling_owlvit import OwlViTObjectDetectionOutput
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.get_logger(__name__)
+
+
+def _upcast(t):
+    """
+    Return `t` in a dtype wide enough to guard multiplications against numerical overflow.
+
+    float32/float64 and int32/int64 tensors are returned unchanged (same object). Any other floating-point tensor
+    is converted with `.float()`, and any other non-floating-point tensor with `.int()`.
+    """
+    if t.is_floating_point():
+        already_wide = t.dtype in (torch.float32, torch.float64)
+        return t if already_wide else t.float()
+
+    already_wide = t.dtype in (torch.int32, torch.int64)
+    return t if already_wide else t.int()
+
+
+def _scale_boxes(boxes, target_sizes):
+    """
+    Multiply each image's boxes by that image's (width, height, width, height).
+
+    Args:
+        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
+            Boxes in (x1, y1, x2, y2) format.
+        target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
+            One (height, width) pair per image.
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: the scaled boxes.
+
+    Raises:
+        TypeError: if `target_sizes` is neither a list, a tuple nor a `torch.Tensor`.
+    """
+
+    if isinstance(target_sizes, torch.Tensor):
+        image_height, image_width = target_sizes.unbind(1)
+    elif isinstance(target_sizes, (list, tuple)):
+        image_height = torch.tensor([hw[0] for hw in target_sizes])
+        image_width = torch.tensor([hw[1] for hw in target_sizes])
+    else:
+        raise TypeError("`target_sizes` must be a list, tuple or torch.Tensor")
+
+    # (batch, 4) multipliers laid out in x, y, x, y order to match the box coordinates.
+    per_image = torch.stack([image_width, image_height, image_width, image_height], dim=1)
+    # The extra axis lets the multipliers broadcast over the boxes of each image.
+    per_image = per_image.unsqueeze(1).to(boxes.device)
+    return boxes * per_image
+
+
+def box_area(boxes):
+    """
+    Area of each box in a set of (x1, y1, x2, y2) boxes.
+
+    Args:
+        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
+            Boxes in (x1, y1, x2, y2) format, expected to satisfy `0 <= x1 < x2` and `0 <= y1 < y2`.
+    Returns:
+        `torch.FloatTensor`: one area per box.
+    """
+    # Widen the dtype first so the width * height product is less likely to overflow.
+    boxes = _upcast(boxes)
+    widths = boxes[:, 2] - boxes[:, 0]
+    heights = boxes[:, 3] - boxes[:, 1]
+    return widths * heights
+
+
+def box_iou(boxes1, boxes2):
+    """
+    Pairwise intersection-over-union between two sets of (x1, y1, x2, y2) boxes.
+
+    Args:
+        boxes1: tensor of N boxes, indexed as `(N, 4)`.
+        boxes2: tensor of M boxes, indexed as `(M, 4)`.
+
+    Returns:
+        A `(iou, union)` pair, both `[N,M]`. No guard is applied for a zero union.
+    """
+    areas_first = box_area(boxes1)
+    areas_second = box_area(boxes2)
+
+    # Corners of the overlap rectangle for every (boxes1[i], boxes2[j]) pair.
+    overlap_min = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    overlap_max = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    # Negative extents mean the pair does not overlap; clamp them to zero.
+    extent = (overlap_max - overlap_min).clamp(min=0)  # [N,M,2]
+    intersection = extent[:, :, 0] * extent[:, :, 1]  # [N,M]
+
+    union = areas_first[:, None] + areas_second - intersection
+
+    return intersection / union, union
+
+
+@requires(backends=("vision",))
+class OwlViTImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs an OWL-ViT image processor.
+
+    This image processor inherits from [`ImageProcessingMixin`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to the height and width given by `size`.
+        size (`dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}):
+            The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. If `size` is a
+            sequence like (h, w), output size will be matched to this. If `size` is an int, then image will be resized
+            to (size, size).
+        resample (`int`, *optional*, defaults to `Resampling.BICUBIC`):
+            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
+            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
+            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
+            to `True`.
+        do_center_crop (`bool`, *optional*, defaults to `False`):
+            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
+            image is padded with 0's and then center cropped.
+        crop_size (`dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}):
+            The size to use for center cropping the image. Only has an effect if `do_center_crop` is set to `True`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the input by a certain factor.
+        rescale_factor (`float`, *optional*, defaults to `1/255`):
+            The factor to use for rescaling the image. Only has an effect if `do_rescale` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input with `image_mean` and `image_std`, i.e. subtract the per-channel
+            mean and divide by the per-channel standard deviation.
+        image_mean (`list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            The sequence of means for each channel, to be used when normalizing images.
+        image_std (`list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            The sequence of standard deviations for each channel, to be used when normalizing images.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize=True,
+        size=None,
+        resample=PILImageResampling.BICUBIC,
+        do_center_crop=False,
+        crop_size=None,
+        do_rescale=True,
+        rescale_factor=1 / 255,
+        do_normalize=True,
+        image_mean=None,
+        image_std=None,
+        **kwargs,
+    ):
+        """
+        Store the preprocessing defaults on the instance.
+
+        `size` and `crop_size` fall back to `{"height": 768, "width": 768}` when `None` and are normalized through
+        `get_size_dict(..., default_to_square=True)`. `image_mean` / `image_std` fall back to `OPENAI_CLIP_MEAN` /
+        `OPENAI_CLIP_STD` when `None`. A legacy `rescale` keyword, if present, is consumed and its value is used as
+        `do_rescale`. Remaining keyword arguments are forwarded to the parent constructor.
+        """
+        size = size if size is not None else {"height": 768, "width": 768}
+        size = get_size_dict(size, default_to_square=True)
+
+        crop_size = crop_size if crop_size is not None else {"height": 768, "width": 768}
+        crop_size = get_size_dict(crop_size, default_to_square=True)
+
+        # Early versions of the OWL-ViT config on the hub had "rescale" as a flag. This clashes with the
+        # vision image processor method `rescale` as it would be set as an attribute during the super().__init__
+        # call. For backwards compatibility the flag is removed from `kwargs` and taken as `do_rescale`. It is
+        # bound to the local name because `self.do_rescale` is assigned from that local below; routing the value
+        # through `kwargs` would let that assignment overwrite it.
+        if "rescale" in kwargs:
+            do_rescale = kwargs.pop("rescale")
+
+        super().__init__(**kwargs)
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to a certain size.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                The size to resize the image to. Must contain height and width keys.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                The resampling filter to use when resizing the input.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `np.ndarray`: The resized image.
+
+        Raises:
+            ValueError: If the normalized `size` lacks a `height` or `width` key.
+        """
+        # Normalize `size` into a dict before checking that both sides are present.
+        size = get_size_dict(size, default_to_square=True)
+        if "height" not in size or "width" not in size:
+            raise ValueError("size dictionary must contain height and width keys")
+
+        # Inside a method body the bare name `resize` is the module-level transform, not this method.
+        return resize(
+            image,
+            (size["height"], size["width"]),
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def center_crop(
+        self,
+        image: np.ndarray,
+        crop_size: dict[str, int],
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Crop the central region of an image.
+
+        Args:
+            image (`np.ndarray`):
+                The image to crop.
+            crop_size (`dict[str, int]`):
+                Target crop size; it has to provide both a height and a width key.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                Channel dimension format of the returned image. When unset, the format of the input image is kept.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                Channel dimension format of `image`. When not given, it is inferred.
+        """
+        crop_size = get_size_dict(crop_size, default_to_square=True)
+        has_both_sides = "height" in crop_size and "width" in crop_size
+        if not has_both_sides:
+            raise ValueError("crop_size dictionary must contain height and width keys")
+
+        # The bare name `center_crop` below is the module-level transform, not this method.
+        target_hw = (crop_size["height"], crop_size["width"])
+        return center_crop(
+            image,
+            target_hw,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale
+    def rescale(
+        self,
+        image: np.ndarray,
+        rescale_factor: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+                one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The rescaled image, as returned by the `rescale` transform.
+        """
+        # Pure delegation: inside a method body the bare name `rescale` is the module-level transform
+        # imported from `image_transforms`, not this method.
+        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        Prepares images for the model: optional resize, center crop, rescale and normalize, applied in that order.
+
+        Args:
+            images (`ImageInput`):
+                The image or batch of images to be prepared. Expects a single or batch of images with pixel values
+                ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether or not to resize the input. If `True`, will resize the input to the size specified by `size`.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                The size to resize the input to. Only has an effect if `do_resize` is set to `True`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                The resampling filter to use when resizing the input. Only has an effect if `do_resize` is `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether or not to center crop the input. If `True`, the input is center cropped to `crop_size`.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                The size to center crop the input to. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether or not to rescale the input. If `True`, `rescale_factor` is applied through `self.rescale`.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                The factor to rescale the input by. Only has an effect if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether or not to normalize the input. If `True`, will normalize the input by subtracting `image_mean`
+                and dividing by `image_std`.
+            image_mean (`Union[float, list[float]]`, *optional*, defaults to `self.image_mean`):
+                The mean to subtract from the input when normalizing. Only has an effect if `do_normalize` is set to
+                `True`.
+            image_std (`Union[float, list[float]]`, *optional*, defaults to `self.image_std`):
+                The standard deviation to divide the input by when normalizing. Only has an effect if `do_normalize` is
+                set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: `ChannelDimension.FIRST` (the signature default) is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            [`BatchFeature`]: The processed images stored under the `"pixel_values"` key, built with `return_tensors`.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format, so only the first one is inspected.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_center_crop:
+            images = [
+                self.center_crop(image, crop_size=crop_size, input_data_format=input_data_format) for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image, rescale_factor=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+        encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+        return encoded_inputs
+
+    def post_process(self, outputs, target_sizes):
+        """
+        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`OwlViTObjectDetectionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
+                image size (before any data augmentation). For visualization, this should be the image size after data
+                augmentation, but before padding.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        # TODO: (amy) add support for other frameworks
+        warnings.warn(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+            FutureWarning,
+        )
+
+        logits, boxes = outputs.logits, outputs.pred_boxes
+
+        if len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        probs = torch.max(logits, dim=-1)  # max over the last dimension; exposes `.values` and `.indices`
+        scores = torch.sigmoid(probs.values)
+        labels = probs.indices
+
+        # Convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(boxes)
+
+        # Convert from relative [0, 1] to absolute coordinates: x values scaled by width, y values by height
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+        return results
+
+    def post_process_object_detection(
+        self,
+        outputs: "OwlViTObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+    ):
+        """
+        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`OwlViTObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Score threshold; only predictions whose score is strictly greater than this value are kept.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+        """
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
+
+        if target_sizes is not None and len(target_sizes) != batch_size:
+            raise ValueError("Make sure that you pass in as many target sizes as images")
+
+        # batch_logits of shape (batch_size, num_queries, num_classes)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
+        batch_scores = torch.sigmoid(batch_class_logits.values)
+        batch_labels = batch_class_logits.indices
+
+        # Convert to [x0, y0, x1, y1] format
+        batch_boxes = center_to_corners_format(batch_boxes)
+
+        # Convert from relative [0, 1] to absolute coordinates (only when target sizes are given)
+        if target_sizes is not None:
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
+
+        results = []
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
+            keep = scores > threshold  # boolean mask, applied identically to scores, labels and boxes
+            scores = scores[keep]
+            labels = labels[keep]
+            boxes = boxes[keep]
+            results.append({"scores": scores, "labels": labels, "boxes": boxes})
+
+        return results
+
+    def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
+        """
+        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
+        api.
+
+        Args:
+            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.0):
+                Minimum confidence threshold to use to filter out predicted boxes.
+            nms_threshold (`float`, *optional*, defaults to 0.3):
+                IoU threshold for non-maximum suppression of overlapping boxes. NMS is skipped if >= 1.0.
+            target_sizes (`torch.Tensor`, *optional*):
+                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
+                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
+                None, predictions will not be unnormalized.
+
+        Returns:
+            `list[Dict]`: One dictionary per image, in batch order, with keys "scores" (display alphas rescaled
+            from the prediction scores), "labels" and "boxes"; images with no kept box get empty tensors. All
+            labels are set to None as `OwlViTForObjectDetection.image_guided_detection` performs one-shot detection.
+        """
+        logits, target_boxes = outputs.logits, outputs.target_pred_boxes
+
+        if target_sizes is not None and len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes is not None and target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        probs = torch.max(logits, dim=-1)
+        scores = torch.sigmoid(probs.values)
+
+        # Convert to [x0, y0, x1, y1] format
+        target_boxes = center_to_corners_format(target_boxes)
+
+        # Apply non-maximum suppression (NMS): visit boxes by descending score and zero out overlapping ones
+        if nms_threshold < 1.0:
+            for idx in range(target_boxes.shape[0]):
+                for i in torch.argsort(-scores[idx]):
+                    if not scores[idx][i]:
+                        continue
+
+                    ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0]
+                    ious[i] = -1.0  # Mask self-IoU.
+                    scores[idx][ious > nms_threshold] = 0.0
+
+        # Convert from relative [0, 1] to absolute coordinates (only when target sizes are given)
+        if target_sizes is not None:
+            target_boxes = _scale_boxes(target_boxes, target_sizes)
+
+        # Compute box display alphas based on prediction scores
+        results = []
+        alphas = torch.zeros_like(scores)
+
+        for idx in range(target_boxes.shape[0]):
+            # Select scores for boxes matching the current query:
+            query_scores = scores[idx]
+            # Deliberately no early `continue` for all-zero scores: skipping would drop this image's entry and
+            # misalign `results` with the batch. The steps below produce an empty entry in that case.
+
+            # Apply threshold on scores before scaling
+            query_scores[query_scores < threshold] = 0.0
+
+            # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1.
+            # All other boxes will either belong to a different query, or will not be shown.
+            max_score = torch.max(query_scores) + 1e-6
+            query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9)
+            query_alphas = torch.clip(query_alphas, 0.0, 1.0)
+            alphas[idx] = query_alphas
+
+            mask = alphas[idx] > 0
+            box_scores = alphas[idx][mask]
+            boxes = target_boxes[idx][mask]
+            results.append({"scores": box_scores, "labels": None, "boxes": boxes})
+
+        return results
+
+
+__all__ = ["OwlViTImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e458f964a04cd9bf12712f82f4691be374d497d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit_fast.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for OwlViT"""
+
+import warnings
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+
+from ...image_processing_utils_fast import BaseImageProcessorFast
+from ...image_transforms import center_to_corners_format
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
+from ...utils import TensorType, auto_docstring, logging
+from .image_processing_owlvit import _scale_boxes, box_iou
+
+
+if TYPE_CHECKING:
+    from .modeling_owlvit import OwlViTObjectDetectionOutput
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class OwlViTImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 768, "width": 768}
+    default_to_square = True
+    crop_size = {"height": 768, "width": 768}
+    do_resize = True
+    do_center_crop = False
+    do_rescale = True
+    do_normalize = None
+    do_convert_rgb = None
+    model_input_names = ["pixel_values"]
+
+    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process
+    def post_process(self, outputs, target_sizes):
+        """
+        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`OwlViTObjectDetectionOutput`]):
+                Raw outputs of the model.
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
+                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
+                image size (before any data augmentation). For visualization, this should be the image size after data
+                augmentation, but before padding.
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        # TODO: (amy) add support for other frameworks
+        warnings.warn(
+            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
+            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
+            FutureWarning,
+        )
+
+        logits, boxes = outputs.logits, outputs.pred_boxes
+
+        if len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        probs = torch.max(logits, dim=-1)  # max over the last dimension; exposes `.values` and `.indices`
+        scores = torch.sigmoid(probs.values)
+        labels = probs.indices
+
+        # Convert to [x0, y0, x1, y1] format
+        boxes = center_to_corners_format(boxes)
+
+        # Convert from relative [0, 1] to absolute coordinates: x values scaled by width, y values by height
+        img_h, img_w = target_sizes.unbind(1)
+        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+        boxes = boxes * scale_fct[:, None, :]
+
+        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]
+
+        return results
+
+    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection
+    def post_process_object_detection(
+        self,
+        outputs: "OwlViTObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+    ):
+        """
+        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`OwlViTObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Score threshold; only predictions whose score is strictly greater than this value are kept.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+        """
+        batch_logits, batch_boxes = outputs.logits, outputs.pred_boxes
+        batch_size = len(batch_logits)
+
+        if target_sizes is not None and len(target_sizes) != batch_size:
+            raise ValueError("Make sure that you pass in as many target sizes as images")
+
+        # batch_logits of shape (batch_size, num_queries, num_classes)
+        batch_class_logits = torch.max(batch_logits, dim=-1)
+        batch_scores = torch.sigmoid(batch_class_logits.values)
+        batch_labels = batch_class_logits.indices
+
+        # Convert to [x0, y0, x1, y1] format
+        batch_boxes = center_to_corners_format(batch_boxes)
+
+        # Convert from relative [0, 1] to absolute coordinates (only when target sizes are given)
+        if target_sizes is not None:
+            batch_boxes = _scale_boxes(batch_boxes, target_sizes)
+
+        results = []
+        for scores, labels, boxes in zip(batch_scores, batch_labels, batch_boxes):
+            keep = scores > threshold  # boolean mask, applied identically to scores, labels and boxes
+            scores = scores[keep]
+            labels = labels[keep]
+            boxes = boxes[keep]
+            results.append({"scores": scores, "labels": labels, "boxes": boxes})
+
+        return results
+
+    # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_image_guided_detection
+    def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
+        """
+        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
+        api.
+
+        Args:
+            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.0):
+                Minimum confidence threshold to use to filter out predicted boxes.
+            nms_threshold (`float`, *optional*, defaults to 0.3):
+                IoU threshold for non-maximum suppression of overlapping boxes. NMS is skipped if >= 1.0.
+            target_sizes (`torch.Tensor`, *optional*):
+                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
+                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
+                None, predictions will not be unnormalized.
+
+        Returns:
+            `list[Dict]`: One dictionary per image, in batch order, with keys "scores" (display alphas rescaled
+            from the prediction scores), "labels" and "boxes"; images with no kept box get empty tensors. All
+            labels are set to None as `OwlViTForObjectDetection.image_guided_detection` performs one-shot detection.
+        """
+        logits, target_boxes = outputs.logits, outputs.target_pred_boxes
+
+        if target_sizes is not None and len(logits) != len(target_sizes):
+            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
+        if target_sizes is not None and target_sizes.shape[1] != 2:
+            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")
+
+        probs = torch.max(logits, dim=-1)
+        scores = torch.sigmoid(probs.values)
+
+        # Convert to [x0, y0, x1, y1] format
+        target_boxes = center_to_corners_format(target_boxes)
+
+        # Apply non-maximum suppression (NMS): visit boxes by descending score and zero out overlapping ones
+        if nms_threshold < 1.0:
+            for idx in range(target_boxes.shape[0]):
+                for i in torch.argsort(-scores[idx]):
+                    if not scores[idx][i]:
+                        continue
+
+                    ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0]
+                    ious[i] = -1.0  # Mask self-IoU.
+                    scores[idx][ious > nms_threshold] = 0.0
+
+        # Convert from relative [0, 1] to absolute coordinates (only when target sizes are given)
+        if target_sizes is not None:
+            target_boxes = _scale_boxes(target_boxes, target_sizes)
+
+        # Compute box display alphas based on prediction scores
+        results = []
+        alphas = torch.zeros_like(scores)
+
+        for idx in range(target_boxes.shape[0]):
+            # Select scores for boxes matching the current query:
+            query_scores = scores[idx]
+            # Deliberately no early `continue` for all-zero scores: skipping would drop this image's entry and
+            # misalign `results` with the batch. The steps below produce an empty entry in that case.
+
+            # Apply threshold on scores before scaling
+            query_scores[query_scores < threshold] = 0.0
+
+            # Scale box alpha such that the best box for each query has alpha 1.0 and the worst box has alpha 0.1.
+            # All other boxes will either belong to a different query, or will not be shown.
+            max_score = torch.max(query_scores) + 1e-6
+            query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9)
+            query_alphas = torch.clip(query_alphas, 0.0, 1.0)
+            alphas[idx] = query_alphas
+
+            mask = alphas[idx] > 0
+            box_scores = alphas[idx][mask]
+            boxes = target_boxes[idx][mask]
+            results.append({"scores": box_scores, "labels": None, "boxes": boxes})
+
+        return results
+
+
+__all__ = ["OwlViTImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/modeling_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/modeling_owlvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..3971b1376d9c10b945fa17fd9f23fcba5ff65e38
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/modeling_owlvit.py
@@ -0,0 +1,1642 @@
+# coding=utf-8
+# Copyright 2022 Google AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OWL-ViT model."""
+
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any, Optional, Union
+
+import torch
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    filter_out_non_signature_kwargs,
+    is_vision_available,
+    logging,
+    torch_int,
+)
+from .configuration_owlvit import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig
+
+
+# `center_to_corners_format` is only imported when `is_vision_available()` reports True.
+if is_vision_available():
+    from transformers.image_transforms import center_to_corners_format
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+
+
+# See all OwlViT models at https://huggingface.co/models?filter=owlvit
+
+
+# Copied from transformers.models.clip.modeling_clip.contrastive_loss with clip->owlvit
+def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
+    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
+
+
+# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlvit
+def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
+    caption_loss = contrastive_loss(similarity)
+    image_loss = contrastive_loss(similarity.t())
+    return (caption_loss + image_loss) / 2.0
+
+
+@dataclass
+@auto_docstring
+class OwlViTOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+        Contrastive loss for image-text similarity.
+    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+        similarity scores.
+    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+        similarity scores.
+    text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`):
+        The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        The image embeddings obtained by applying the projection layer to the pooled output of
+        [`OwlViTVisionModel`].
+    text_model_output (tuple[`BaseModelOutputWithPooling`]):
+        The output of the [`OwlViTTextModel`].
+    vision_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`OwlViTVisionModel`].
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits_per_image: Optional[torch.FloatTensor] = None
+    logits_per_text: Optional[torch.FloatTensor] = None
+    text_embeds: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    text_model_output: BaseModelOutputWithPooling = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+# Copied from transformers.loss.loss_for_object_detection._upcast
+def _upcast(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.is_floating_point():
+        return t if t.dtype in (torch.float32, torch.float64) else t.float()
+    else:
+        return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+# Copied from transformers.loss.loss_for_object_detection.box_area
+def box_area(boxes: Tensor) -> Tensor:
+    """
+    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
+
+    Args:
+        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
+            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
+            < x2` and `0 <= y1 < y2`.
+
+    Returns:
+        `torch.FloatTensor`: a tensor containing the area for each box.
+    """
+    boxes = _upcast(boxes)
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+# Copied from transformers.loss.loss_for_object_detection.box_iou
+def box_iou(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
+    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]
+
+    union = area1[:, None] + area2 - inter
+
+    iou = inter / union
+    return iou, union
+
+
+# Copied from transformers.loss.loss_for_object_detection.generalized_box_iou
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
+
+    Returns:
+        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
+        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
+    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
+        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
+    iou, union = box_iou(boxes1, boxes2)
+
+    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    area = width_height[:, :, 0] * width_height[:, :, 1]
+
+    return iou - (area - union) / area
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`OwlViTForObjectDetection`].
+    """
+)
+class OwlViTObjectDetectionOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
+        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to retrieve the
+        unnormalized bounding boxes.
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
+        The text embeddings obtained by applying the projection layer to the pooled output of [`OwlViTTextModel`].
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
+        Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
+        image embeddings for each patch.
+    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
+        Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
+        number of patches is (image_size / patch_size)**2.
+    text_model_output (tuple[`BaseModelOutputWithPooling`]):
+        The output of the [`OwlViTTextModel`].
+    vision_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`OwlViTVisionModel`].
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    text_embeds: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    class_embeds: Optional[torch.FloatTensor] = None
+    text_model_output: BaseModelOutputWithPooling = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`OwlViTForObjectDetection.image_guided_detection`].
+    """
+)
+class OwlViTImageGuidedObjectDetectionOutput(ModelOutput):
+    r"""
+    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
+        Classification logits (including no-object) for all queries.
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
+        Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
+        image embeddings for each patch.
+    query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
+        Pooled output of [`OwlViTVisionModel`]. OWL-ViT represents images as a set of image patches and computes
+        image embeddings for each patch.
+    target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
+        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual target image in the batch
+        (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
+        retrieve the unnormalized bounding boxes.
+    query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
+        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual query image in the batch
+        (disregarding possible padding). You can use [`~OwlViTImageProcessor.post_process_object_detection`] to
+        retrieve the unnormalized bounding boxes.
+    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
+        Class embeddings of all image patches. OWL-ViT represents images as a set of image patches where the total
+        number of patches is (image_size / patch_size)**2.
+    text_model_output (tuple[`BaseModelOutputWithPooling`]):
+        The output of the [`OwlViTTextModel`].
+    vision_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`OwlViTVisionModel`].
+    """
+
+    logits: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    query_image_embeds: Optional[torch.FloatTensor] = None
+    target_pred_boxes: Optional[torch.FloatTensor] = None
+    query_pred_boxes: Optional[torch.FloatTensor] = None
+    class_embeds: Optional[torch.FloatTensor] = None
+    text_model_output: BaseModelOutputWithPooling = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+class OwlViTVisionEmbeddings(nn.Module):
+    def __init__(self, config: OwlViTVisionConfig):
+        super().__init__()
+        self.patch_size = config.patch_size
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (config.image_size // config.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings.interpolate_pos_encoding
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        position_embedding = self.position_embedding.weight.unsqueeze(0)
+        num_positions = position_embedding.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embedding(self.position_ids)
+
+        class_pos_embed = position_embedding[:, :1]
+        patch_pos_embed = position_embedding[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch_size, num_channels, height, width]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+
+
+class OwlViTTextEmbeddings(nn.Module):
+    def __init__(self, config: OwlViTTextConfig):
+        super().__init__()
+        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+
+        return embeddings
+
+
+class OwlViTAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scale
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        # apply the causal_attention_mask first
+        if causal_attention_mask is not None:
+            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {causal_attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        # For int8 compatibility, sometimes the `attn_probs` are in `fp32`
+        attn_probs = attn_probs.to(value_states.dtype)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->OwlViT
+class OwlViTMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->OwlViT
+class OwlViTEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: OwlViTConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = OwlViTAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = OwlViTMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class OwlViTPreTrainedModel(PreTrainedModel):
+    config: OwlViTConfig
+    base_model_prefix = "owlvit"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["OwlViTEncoderLayer"]
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights"""
+        factor = self.config.initializer_factor
+        if isinstance(module, OwlViTTextEmbeddings):
+            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
+        elif isinstance(module, OwlViTVisionEmbeddings):
+            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
+            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
+            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
+        elif isinstance(module, OwlViTAttention):
+            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            out_proj_std = (module.embed_dim**-0.5) * factor
+            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
+            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
+        elif isinstance(module, OwlViTMLP):
+            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
+            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
+            nn.init.normal_(module.fc1.weight, std=fc_std)
+            nn.init.normal_(module.fc2.weight, std=in_proj_std)
+        elif isinstance(module, OwlViTModel):
+            nn.init.normal_(
+                module.text_projection.weight,
+                std=module.text_embed_dim**-0.5 * factor,
+            )
+            nn.init.normal_(
+                module.visual_projection.weight,
+                std=module.vision_embed_dim**-0.5 * factor,
+            )
+            module.logit_scale.data.fill_(self.config.logit_scale_init_value)
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=factor)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+class OwlViTEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`OwlViTEncoderLayer`].
+
+    Args:
+        config: OwlViTConfig
+    """
+
+    def __init__(self, config: OwlViTConfig):
+        super().__init__()
+        self.layers = nn.ModuleList([OwlViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class OwlViTTextTransformer(nn.Module):
+    def __init__(self, config: OwlViTTextConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = OwlViTTextEmbeddings(config)
+        self.encoder = OwlViTEncoder(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
+
+        # num_samples, seq_len = input_shape  where num_samples = batch_size * num_max_text_queries
+        # OWLVIT's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = _create_4d_causal_attention_mask(
+            input_shape, hidden_states.dtype, device=hidden_states.device
+        )
+        # expand attention_mask
+        if attention_mask is not None:
+            # [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.final_layer_norm(last_hidden_state)
+
+        # take features from the end of tokens embedding (end of token is the highest number in each sequence)
+        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+        pooled_output = last_hidden_state[
+            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
+            input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
+        ]
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class OwlViTTextModel(OwlViTPreTrainedModel):
+    config: OwlViTTextConfig
+
+    def __init__(self, config: OwlViTTextConfig):
+        super().__init__(config)
+        self.text_model = OwlViTTextTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.text_model.embeddings.token_embedding
+
+    def set_input_embeddings(self, value):
+        self.text_model.embeddings.token_embedding = value
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+
+        Examples:
+        ```python
+        >>> from transformers import AutoProcessor, OwlViTTextModel
+
+        >>> model = OwlViTTextModel.from_pretrained("google/owlvit-base-patch32")
+        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
+        >>> inputs = processor(
+        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
+        ... )
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
+        ```"""
+
+        # Get embeddings for all text queries in all batch samples
+        return self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+class OwlViTVisionTransformer(nn.Module):
+    def __init__(self, config: OwlViTVisionConfig):
+        super().__init__()
+        self.config = config
+
+        self.embeddings = OwlViTVisionEmbeddings(config)
+        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.encoder = OwlViTEncoder(config)
+        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Cast the input to the expected `dtype`
+        expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
+        pixel_values = pixel_values.to(expected_input_dtype)
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+        hidden_states = self.pre_layernorm(hidden_states)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        pooled_output = last_hidden_state[:, 0, :]
+
+        pooled_output = self.post_layernorm(pooled_output)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class OwlViTVisionModel(OwlViTPreTrainedModel):
+    config: OwlViTVisionConfig
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: OwlViTVisionConfig):
+        super().__init__(config)
+        self.vision_model = OwlViTVisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, OwlViTVisionModel
+
+        >>> model = OwlViTVisionModel.from_pretrained("google/owlvit-base-patch32")
+        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output  # pooled CLS states
+        ```"""
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+
+@auto_docstring
+class OwlViTModel(OwlViTPreTrainedModel):
+    config: OwlViTConfig
+
+    def __init__(self, config: OwlViTConfig):
+        super().__init__(config)
+
+        if not isinstance(config.text_config, OwlViTTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type OwlViTTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, OwlViTVisionConfig):
+            raise TypeError(
+                "config.vision_config is expected to be of type OwlViTVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = OwlViTTextTransformer(text_config)
+        self.vision_model = OwlViTVisionTransformer(vision_config)
+
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @filter_out_non_signature_kwargs()
+    @auto_docstring
+    def get_text_features(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`OwlViTTextModel`].
+
+        Examples:
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, OwlViTModel
+
+        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
+        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
+        >>> inputs = processor(
+        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
+        ... )
+        >>> with torch.inference_mode():
+        ...     text_features = model.get_text_features(**inputs)
+        ```"""
+        # Get embeddings for all text queries in all batch samples
+        text_outputs: BaseModelOutputWithPooling = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
+        text_features = self.text_projection(text_outputs.pooler_output)
+
+        return text_features
+
+    @filter_out_non_signature_kwargs()
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: torch.Tensor,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
+            applying the projection layer to the pooled output of [`OwlViTVisionModel`].
+
+        Examples:
+        ```python
+        >>> import torch
+        >>> from transformers.image_utils import load_image
+        >>> from transformers import AutoProcessor, OwlViTModel
+
+        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
+        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = load_image(url)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+        >>> with torch.inference_mode():
+        ...     image_features = model.get_image_features(**inputs)
+        ```"""
+        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+        image_features = self.visual_projection(vision_outputs.pooler_output)
+
+        return image_features
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_base_image_embeds: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, OwlViTOutput]:
+        r"""
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+        return_base_image_embeds (`bool`, *optional*):
+            Whether or not to return the base image embeddings.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, OwlViTModel
+
+        >>> model = OwlViTModel.from_pretrained("google/owlvit-base-patch32")
+        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
+        ```"""
+        # Use OWL-ViT model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        # Get embeddings for all text queries in all batch samples
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        text_embeds = text_outputs[1]
+        text_embeds = self.text_projection(text_embeds)
+        image_embeds = vision_outputs[1]
+        image_embeds = self.visual_projection(image_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / torch.linalg.norm(image_embeds, ord=2, dim=-1, keepdim=True)
+        text_embeds_norm = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True)
+
+        # cosine similarity as logits and set it on the correct device
+        logit_scale = self.logit_scale.exp().to(image_embeds.device)
+
+        logits_per_text = torch.matmul(text_embeds_norm, image_embeds.t()) * logit_scale
+        logits_per_image = logits_per_text.t()
+
+        loss = None
+        if return_loss:
+            loss = owlvit_loss(logits_per_text)
+
+        text_embeds = text_embeds_norm
+
+        if not return_dict:
+            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+            return ((loss,) + output) if loss is not None else output
+
+        return OwlViTOutput(
+            loss=loss,
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+
+
+class OwlViTBoxPredictionHead(nn.Module):
+    def __init__(self, config: OwlViTConfig, out_dim: int = 4):
+        super().__init__()
+
+        width = config.vision_config.hidden_size
+        self.dense0 = nn.Linear(width, width)
+        self.dense1 = nn.Linear(width, width)
+        self.gelu = nn.GELU()
+        self.dense2 = nn.Linear(width, out_dim)
+
+    def forward(self, image_features: torch.Tensor) -> torch.FloatTensor:
+        output = self.dense0(image_features)
+        output = self.gelu(output)
+        output = self.dense1(output)
+        output = self.gelu(output)
+        output = self.dense2(output)
+        return output
+
+
+class OwlViTClassPredictionHead(nn.Module):
+    def __init__(self, config: OwlViTConfig):
+        super().__init__()
+
+        out_dim = config.text_config.hidden_size
+        self.query_dim = config.vision_config.hidden_size
+
+        self.dense0 = nn.Linear(self.query_dim, out_dim)
+        self.logit_shift = nn.Linear(self.query_dim, 1)
+        self.logit_scale = nn.Linear(self.query_dim, 1)
+        self.elu = nn.ELU()
+
+    def forward(
+        self,
+        image_embeds: torch.FloatTensor,
+        query_embeds: Optional[torch.FloatTensor],
+        query_mask: Optional[torch.Tensor],
+    ) -> tuple[torch.FloatTensor]:
+        image_class_embeds = self.dense0(image_embeds)
+        if query_embeds is None:
+            device = image_class_embeds.device
+            batch_size, num_patches = image_class_embeds.shape[:2]
+            pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device)
+            return (pred_logits, image_class_embeds)
+
+        # Normalize image and text features
+        image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6)
+        query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6)
+
+        # Get class predictions
+        pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)
+
+        # Apply a learnable shift and scale to logits
+        logit_shift = self.logit_shift(image_embeds)
+        logit_scale = self.logit_scale(image_embeds)
+        logit_scale = self.elu(logit_scale) + 1
+        pred_logits = (pred_logits + logit_shift) * logit_scale
+
+        if query_mask is not None:
+            if query_mask.ndim > 1:
+                query_mask = torch.unsqueeze(query_mask, dim=-2)
+
+            pred_logits = torch.where(query_mask == 0, torch.finfo(pred_logits.dtype).min, pred_logits)
+            pred_logits = pred_logits.to(torch.float32)
+
+        return (pred_logits, image_class_embeds)
+
+
+class OwlViTForObjectDetection(OwlViTPreTrainedModel):
+    config: OwlViTConfig
+
+    def __init__(self, config: OwlViTConfig):
+        super().__init__(config)
+
+        self.owlvit = OwlViTModel(config)
+        self.class_head = OwlViTClassPredictionHead(config)
+        self.box_head = OwlViTBoxPredictionHead(config)
+
+        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
+        self.sigmoid = nn.Sigmoid()
+        self.config = config
+        self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
+        self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
+        self.box_bias = self.compute_box_bias(self.num_patches_height, self.num_patches_width)
+
+    @staticmethod
+    def normalize_grid_corner_coordinates(num_patches_height: int, num_patches_width: int) -> torch.Tensor:
+        # Create grid coordinates using torch
+        x_coordinates = torch.arange(1, num_patches_width + 1, dtype=torch.float32)
+        y_coordinates = torch.arange(1, num_patches_height + 1, dtype=torch.float32)
+        xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy")
+
+        # Stack the coordinates and divide by their respective patch counts
+        box_coordinates = torch.stack((xx, yy), dim=-1)
+        box_coordinates[..., 0] /= num_patches_width
+        box_coordinates[..., 1] /= num_patches_height
+
+        # Flatten (h, w, 2) -> (h*w, 2)
+        box_coordinates = box_coordinates.view(-1, 2)
+
+        return box_coordinates
+
+    @lru_cache(maxsize=2)
+    def compute_box_bias(
+        self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None
+    ) -> torch.Tensor:
+        if feature_map is not None:
+            raise ValueError("feature_map has been deprecated as an input. Please pass in num_patches instead")
+        # The box center is biased to its position on the feature grid
+        box_coordinates = self.normalize_grid_corner_coordinates(num_patches_height, num_patches_width)
+        box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)
+
+        # Unnormalize xy
+        box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)
+
+        # The box size is biased to the patch size
+        box_size = torch.full_like(box_coord_bias, 1.0)
+        box_size[..., 0] /= num_patches_width
+        box_size[..., 1] /= num_patches_height
+        box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)
+
+        # Compute box bias
+        box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1)
+        return box_bias
+
+    def box_predictor(
+        self,
+        image_feats: torch.FloatTensor,
+        feature_map: torch.FloatTensor,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            image_feats:
+                Features extracted from the image, returned by the `image_text_embedder` method.
+            feature_map:
+                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
+            interpolate_pos_encoding:
+                Whether to interpolate the pre-trained position encodings.
+        Returns:
+            pred_boxes:
+                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
+        """
+        # Bounding box detection head [batch_size, num_boxes, 4].
+        pred_boxes = self.box_head(image_feats)
+
+        # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
+        if interpolate_pos_encoding:
+            _, num_patches_height, num_patches_width, _ = feature_map.shape
+            box_bias = self.compute_box_bias(num_patches_height, num_patches_width)
+        else:
+            box_bias = self.box_bias
+
+        box_bias = box_bias.to(feature_map.device)
+        pred_boxes += box_bias
+        pred_boxes = self.sigmoid(pred_boxes)
+        return pred_boxes
+
+    def class_predictor(
+        self,
+        image_feats: torch.FloatTensor,
+        query_embeds: Optional[torch.FloatTensor] = None,
+        query_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            image_feats:
+                Features extracted from the `image_text_embedder`.
+            query_embeds:
+                Text query embeddings.
+            query_mask:
+                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
+        """
+        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)
+
+        return (pred_logits, image_class_embeds)
+
+    def image_text_embedder(
+        self,
+        input_ids: torch.Tensor,
+        pixel_values: torch.FloatTensor,
+        attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> tuple[torch.FloatTensor]:
+        # Encode text and image
+        outputs = self.owlvit(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=True,
+        )
+
+        if interpolate_pos_encoding:
+            _, _, height, width = pixel_values.shape
+            num_patches_height = height // self.config.vision_config.patch_size
+            num_patches_width = width // self.config.vision_config.patch_size
+        else:
+            num_patches_height = self.num_patches_height
+            num_patches_width = self.num_patches_width
+
+        # Get image embeddings
+        last_hidden_state = outputs.vision_model_output[0]
+        image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)
+
+        # Resize class token
+        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)
+
+        # Merge image embedding with class tokens
+        image_embeds = image_embeds[:, 1:, :] * class_token_out
+        image_embeds = self.layer_norm(image_embeds)
+
+        # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
+        new_size = (
+            image_embeds.shape[0],
+            num_patches_height,
+            num_patches_width,
+            image_embeds.shape[-1],
+        )
+        image_embeds = image_embeds.reshape(new_size)
+        text_embeds = outputs[-4]
+
+        return (text_embeds, image_embeds, outputs)
+
+    def image_embedder(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> tuple[torch.FloatTensor]:
+        # Get OwlViTModel vision embeddings (same as CLIP)
+        vision_outputs = self.owlvit.vision_model(
+            pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True
+        )
+
+        if interpolate_pos_encoding:
+            _, _, height, width = pixel_values.shape
+            num_patches_height = height // self.config.vision_config.patch_size
+            num_patches_width = width // self.config.vision_config.patch_size
+        else:
+            num_patches_height = self.num_patches_height
+            num_patches_width = self.num_patches_width
+
+        # Apply post_layernorm to last_hidden_state, return non-projected output
+        last_hidden_state = vision_outputs[0]
+        image_embeds = self.owlvit.vision_model.post_layernorm(last_hidden_state)
+
+        # Resize class token
+        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)
+
+        # Merge image embedding with class tokens
+        image_embeds = image_embeds[:, 1:, :] * class_token_out
+        image_embeds = self.layer_norm(image_embeds)
+
+        # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
+        new_size = (
+            image_embeds.shape[0],
+            num_patches_height,
+            num_patches_width,
+            image_embeds.shape[-1],
+        )
+        image_embeds = image_embeds.reshape(new_size)
+
+        return (image_embeds, vision_outputs)
+
+    def embed_image_query(
+        self,
+        query_image_features: torch.FloatTensor,
+        query_feature_map: torch.FloatTensor,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.FloatTensor:
+        _, class_embeds = self.class_predictor(query_image_features)
+        pred_boxes = self.box_predictor(query_image_features, query_feature_map, interpolate_pos_encoding)
+        pred_boxes_as_corners = center_to_corners_format(pred_boxes)
+
+        # Loop over query images
+        best_class_embeds = []
+        best_box_indices = []
+        pred_boxes_device = pred_boxes_as_corners.device
+
+        for i in range(query_image_features.shape[0]):
+            each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
+            each_query_pred_boxes = pred_boxes_as_corners[i]
+            ious, _ = box_iou(each_query_box, each_query_pred_boxes)
+
+            # If there are no overlapping boxes, fall back to generalized IoU
+            if torch.all(ious[0] == 0.0):
+                ious = generalized_box_iou(each_query_box, each_query_pred_boxes)
+
+            # Use an adaptive threshold to include all boxes within 80% of the best IoU
+            iou_threshold = torch.max(ious) * 0.8
+
+            selected_inds = (ious[0] >= iou_threshold).nonzero()
+            if selected_inds.numel():
+                selected_embeddings = class_embeds[i][selected_inds.squeeze(1)]
+                mean_embeds = torch.mean(class_embeds[i], axis=0)
+                mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
+                best_box_ind = selected_inds[torch.argmin(mean_sim)]
+                best_class_embeds.append(class_embeds[i][best_box_ind])
+                best_box_indices.append(best_box_ind)
+
+        if best_class_embeds:
+            query_embeds = torch.stack(best_class_embeds)
+            box_indices = torch.stack(best_box_indices)
+        else:
+            query_embeds, box_indices = None, None
+
+        return query_embeds, box_indices, pred_boxes
+
+    @auto_docstring
+    def image_guided_detection(
+        self,
+        pixel_values: torch.FloatTensor,
+        query_pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> OwlViTImageGuidedObjectDetectionOutput:
+        r"""
+        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values of query image(s) to be detected. Pass in one query image per target image.
+
+        Examples:
+        ```python
+        >>> import requests
+        >>> from PIL import Image
+        >>> import torch
+        >>> from transformers import AutoProcessor, OwlViTForObjectDetection
+
+        >>> processor = AutoProcessor.from_pretrained("google/owlvit-base-patch16")
+        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
+        >>> query_image = Image.open(requests.get(query_url, stream=True).raw)
+        >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")
+        >>> with torch.no_grad():
+        ...     outputs = model.image_guided_detection(**inputs)
+        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
+        >>> target_sizes = torch.Tensor([image.size[::-1]])
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> results = processor.post_process_image_guided_detection(
+        ...     outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes
+        ... )
+        >>> i = 0  # Retrieve predictions for the first image
+        >>> boxes, scores = results[i]["boxes"], results[i]["scores"]
+        >>> for box, score in zip(boxes, scores):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
+        Detected similar object with confidence 0.856 at location [10.94, 50.4, 315.8, 471.39]
+        Detected similar object with confidence 1.0 at location [334.84, 25.33, 636.16, 374.71]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # Embed the query image(s) first (only element [0], the feature map, is kept), then the target image(s)
+        query_feature_map = self.image_embedder(
+            pixel_values=query_pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+        )[0]
+        feature_map, vision_outputs = self.image_embedder(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+        # Flatten the target patch grid: [batch, height, width, dim] -> [batch, height * width, dim]
+        batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
+        image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
+        # Same flattening for the query feature map; this rebinds batch_size and the patch-count locals
+        batch_size, num_patches_height, num_patches_width, hidden_dim = query_feature_map.shape
+        query_image_feats = torch.reshape(
+            query_feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)
+        )
+        # Get top class embedding and best box index for each query image in batch
+        query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(
+            query_image_feats, query_feature_map, interpolate_pos_encoding
+        )
+        # NOTE(review): embed_image_query can return query_embeds=None; confirm class_predictor accepts that
+        # Predict object classes [batch_size, num_patches, num_queries+1]
+        (pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds)
+
+        # Predict object boxes for the target image(s)
+        target_pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)
+
+        if not return_dict:  # plain-tuple output path
+            output = (
+                feature_map,
+                query_feature_map,
+                target_pred_boxes,
+                query_pred_boxes,
+                pred_logits,
+                class_embeds,
+                vision_outputs.to_tuple(),
+            )
+            output = tuple(x for x in output if x is not None)  # drop entries that are None
+            return output
+
+        return OwlViTImageGuidedObjectDetectionOutput(
+            image_embeds=feature_map,
+            query_image_embeds=query_feature_map,
+            target_pred_boxes=target_pred_boxes,
+            query_pred_boxes=query_pred_boxes,
+            logits=pred_logits,
+            class_embeds=class_embeds,
+            text_model_output=None,  # nothing in this method produces a text output
+            vision_model_output=vision_outputs,
+        )
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        pixel_values: torch.FloatTensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> OwlViTObjectDetectionOutput:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids).
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
+            `vision_model_last_hidden_state` under returned tensors for more detail.
+
+        Examples:
+        ```python
+        >>> import requests
+        >>> from PIL import Image
+        >>> import torch
+
+        >>> from transformers import OwlViTProcessor, OwlViTForObjectDetection
+
+        >>> processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+        >>> model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
+        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
+        >>> target_sizes = torch.tensor([(image.height, image.width)])
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> results = processor.post_process_grounded_object_detection(
+        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
+        ... )
+        >>> # Retrieve predictions for the first image for the corresponding text queries
+        >>> result = results[0]
+        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
+        >>> for box, score, text_label in zip(boxes, scores, text_labels):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
+        Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29]
+        Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # Embed images and text queries in one call: text query embeddings, image feature map, raw model outputs
+        query_embeds, feature_map, outputs = self.image_text_embedder(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+
+        # Text and vision model outputs, kept so they can be returned to the caller below
+        text_outputs = outputs.text_model_output
+        vision_outputs = outputs.vision_model_output
+        # Flatten the patch grid: [batch_size, height, width, hidden_dim] -> [batch_size, height * width, hidden_dim]
+        batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
+        image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))
+
+        # Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim]
+        max_text_queries = input_ids.shape[0] // batch_size  # queries per image (floor division)
+        query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1])
+
+        # A query whose first token id is not > 0 is treated as padding; mask is [batch_size, max_text_queries].
+        input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1])
+        query_mask = input_ids[..., 0] > 0
+
+        # Predict object classes [batch_size, num_patches, num_queries+1]
+        (pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask)
+
+        # Predict object boxes
+        pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)
+
+        if not return_dict:  # plain-tuple output path
+            output = (
+                pred_logits,
+                pred_boxes,
+                query_embeds,
+                feature_map,
+                class_embeds,
+                text_outputs.to_tuple(),
+                vision_outputs.to_tuple(),
+            )
+            output = tuple(x for x in output if x is not None)  # drop entries that are None
+            return output
+
+        return OwlViTObjectDetectionOutput(
+            image_embeds=feature_map,
+            text_embeds=query_embeds,
+            pred_boxes=pred_boxes,
+            logits=pred_logits,
+            class_embeds=class_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+
+
+__all__ = ["OwlViTModel", "OwlViTPreTrainedModel", "OwlViTTextModel", "OwlViTVisionModel", "OwlViTForObjectDetection"]  # names exported by `import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/processing_owlvit.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/processing_owlvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae39d2b6b30783fb7281b56f874f8a840b129732
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/owlvit/processing_owlvit.py
@@ -0,0 +1,322 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image/Text processor class for OWL-ViT
+"""
+
+import warnings
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import (
+    ImagesKwargs,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+)
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import TensorType, is_flax_available, is_tf_available, is_torch_available
+
+
+if TYPE_CHECKING:
+    from .modeling_owlvit import OwlViTImageGuidedObjectDetectionOutput, OwlViTObjectDetectionOutput
+
+
+class OwlViTImagesKwargs(ImagesKwargs, total=False):
+    query_images: Optional[ImageInput]  # popped by OwlViTProcessor.__call__ and turned into query_pixel_values
+
+
+class OwlViTProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: OwlViTImagesKwargs  # image kwargs schema that additionally allows `query_images`
+    _defaults = {
+        "text_kwargs": {
+            "padding": "max_length",  # default padding passed to the tokenizer via text_kwargs
+        },
+        "images_kwargs": {},
+        "common_kwargs": {
+            "return_tensors": "np",  # default; OwlViTProcessor.__call__ reads the merged value back
+        },
+    }
+
+
+class OwlViTProcessor(ProcessorMixin):
+    r"""
+    Constructs an OWL-ViT processor which wraps [`OwlViTImageProcessor`] and [`CLIPTokenizer`]/[`CLIPTokenizerFast`]
+    into a single processor that inherits both the image processor and tokenizer functionalities. See the
+    [`~OwlViTProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`OwlViTImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`CLIPTokenizer`, `CLIPTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "OwlViTImageProcessor"
+    tokenizer_class = ("CLIPTokenizer", "CLIPTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")  # deprecated alias, used only as a fallback below
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+
+        super().__init__(image_processor, tokenizer)  # any remaining **kwargs are not passed on
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[OwlViTProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several text(s) and image(s). This method forwards the `text` and
+        `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        OwlViTImageProcessor's [`~OwlViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the
+        docstring of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`,
+            `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The query image to be prepared, one query image is expected per target image to be queried. Each image
+                can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image
+                should be of shape (C, H, W), where C is a number of channels, H and W are image height and width.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **query_pixel_values** -- Pixel values of the query images to be fed to a model. Returned when `query_images` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            OwlViTProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        query_images = output_kwargs["images_kwargs"].pop("query_images", None)  # not forwarded as a kwarg
+        return_tensors = output_kwargs["common_kwargs"]["return_tensors"]
+
+        if text is None and query_images is None and images is None:
+            raise ValueError(
+                "You have to specify at least one text or query image or image. All three cannot be none."
+            )
+
+        data = {}
+        if text is not None:
+            if isinstance(text, str) or (isinstance(text, list) and not isinstance(text[0], list)):
+                encodings = [self.tokenizer(text, **output_kwargs["text_kwargs"])]
+            # NOTE(review): the `if` test above reads text[0], so an empty list raises IndexError
+            elif isinstance(text, list) and isinstance(text[0], list):
+                encodings = []
+
+                # Maximum number of queries across batch
+                max_num_queries = max(len(text_single) for text_single in text)
+
+                # Pad every sample to max_num_queries by appending " " (single-space) placeholder queries
+                for text_single in text:
+                    if len(text_single) != max_num_queries:
+                        text_single = text_single + [" "] * (max_num_queries - len(text_single))
+
+                    encoding = self.tokenizer(text_single, **output_kwargs["text_kwargs"])
+                    encodings.append(encoding)
+            else:
+                raise TypeError("Input text should be a string, a list of strings or a nested list of strings")
+
+            if return_tensors == "np":  # join the per-sample encodings along axis 0
+                input_ids = np.concatenate([encoding["input_ids"] for encoding in encodings], axis=0)
+                attention_mask = np.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0)
+
+            elif return_tensors == "jax" and is_flax_available():
+                import jax.numpy as jnp
+
+                input_ids = jnp.concatenate([encoding["input_ids"] for encoding in encodings], axis=0)
+                attention_mask = jnp.concatenate([encoding["attention_mask"] for encoding in encodings], axis=0)
+
+            elif return_tensors == "pt" and is_torch_available():
+                import torch
+
+                input_ids = torch.cat([encoding["input_ids"] for encoding in encodings], dim=0)
+                attention_mask = torch.cat([encoding["attention_mask"] for encoding in encodings], dim=0)
+
+            elif return_tensors == "tf" and is_tf_available():
+                import tensorflow as tf
+                # NOTE(review): uses stack where the other branches concatenate along axis 0 — confirm intended
+                input_ids = tf.stack([encoding["input_ids"] for encoding in encodings], axis=0)
+                attention_mask = tf.stack([encoding["attention_mask"] for encoding in encodings], axis=0)
+
+            else:  # unrecognised return_tensors value, or its framework is not available
+                raise ValueError("Target return tensor type could not be returned")
+
+            data["input_ids"] = input_ids
+            data["attention_mask"] = attention_mask
+
+        if query_images is not None:
+            query_pixel_values = self.image_processor(query_images, **output_kwargs["images_kwargs"]).pixel_values
+            # Query images always override the text prompt: rebinding `data` drops input_ids/attention_mask set above
+            data = {"query_pixel_values": query_pixel_values}
+
+        if images is not None:
+            image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
+            data["pixel_values"] = image_features.pixel_values
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def post_process(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process`]. Please refer to the docstring
+        of this method for more information.
+        """
+        return self.image_processor.post_process(*args, **kwargs)
+
+    def post_process_object_detection(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to [`OwlViTImageProcessor.post_process_object_detection`]. Please refer
+        to the docstring of this method for more information.
+        """
+        warnings.warn(
+            "`post_process_object_detection` method is deprecated for OwlVitProcessor and will be removed in v5. "
+            "Use `post_process_grounded_object_detection` instead.",
+            FutureWarning,
+        )
+        return self.image_processor.post_process_object_detection(*args, **kwargs)
+
+    def post_process_grounded_object_detection(
+        self,
+        outputs: "OwlViTObjectDetectionOutput",
+        threshold: float = 0.1,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+        text_labels: Optional[list[list[str]]] = None,
+    ):
+        """
+        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format.
+
+        Args:
+            outputs ([`OwlViTObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.1):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+            text_labels (`list[list[str]]`, *optional*):
+                List of lists of text labels for each image in the batch. If unset, "text_labels" in output will be
+                set to `None`.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "labels": Indexes of the classes predicted by the model on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+            - "text_labels": The text labels for each predicted bounding box on the image.
+        """
+        output = self.image_processor.post_process_object_detection(
+            outputs=outputs, threshold=threshold, target_sizes=target_sizes
+        )
+
+        if text_labels is not None and len(text_labels) != len(output):
+            raise ValueError("Make sure that you pass in as many lists of text labels as images")
+
+        # Attach text labels: each predicted label index selects a string from that image's text_labels list
+        if text_labels is not None:
+            for image_output, image_text_labels in zip(output, text_labels):
+                object_text_labels = [image_text_labels[i] for i in image_output["labels"]]
+                image_output["text_labels"] = object_text_labels
+        else:
+            for image_output in output:
+                image_output["text_labels"] = None
+
+        return output
+
+    def post_process_image_guided_detection(
+        self,
+        outputs: "OwlViTImageGuidedObjectDetectionOutput",
+        threshold: float = 0.0,
+        nms_threshold: float = 0.3,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+    ):
+        """
+        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
+        api.
+
+        Args:
+            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.0):
+                Minimum confidence threshold to use to filter out predicted boxes.
+            nms_threshold (`float`, *optional*, defaults to 0.3):
+                IoU threshold for non-maximum suppression of overlapping boxes.
+            target_sizes (`torch.Tensor`, *optional*):
+                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
+                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
+                None, predictions will not be unnormalized.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the following keys:
+            - "scores": The confidence scores for each predicted box on the image.
+            - "boxes": Image bounding boxes in (top_left_x, top_left_y, bottom_right_x, bottom_right_y) format.
+            - "labels": Set to `None`.
+        """
+        return self.image_processor.post_process_image_guided_detection(
+            outputs=outputs, threshold=threshold, nms_threshold=nms_threshold, target_sizes=target_sizes
+        )
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class  # deprecated alias: same value as image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor  # deprecated alias: same object as image_processor
+
+
+__all__ = ["OwlViTProcessor"]  # names exported by `import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14e75f132c14d77feea8fa3f7f4f1f655b9eb1cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/configuration_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/configuration_perceiver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d521cb16109bbb655b372656920fc8db95cad8b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/configuration_perceiver.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/feature_extraction_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/feature_extraction_perceiver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b19baaab45581b316741199f2b467befbf9d55b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/feature_extraction_perceiver.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd1c0a07dfa49a5e9d5baafe74dea9f055460789
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06a2ba66ccfc9b432189d3f00ac72b0f1d7a0f7a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/image_processing_perceiver_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/tokenization_perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/tokenization_perceiver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e5d87f0189b9456a7442e9c24d2b449b8c0e880
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perceiver/__pycache__/tokenization_perceiver.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cb1e7a9cd04fb32cb8eb29516d95c0fc8e9d108
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # static type checkers only: import the public names eagerly
+    from .configuration_phi3 import *
+    from .modeling_phi3 import *
+else:  # runtime path; presumably defers the submodule imports (see _LazyModule) — confirm
+    import sys
+    # Swap this module's sys.modules entry for a _LazyModule built from this file's import structure.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/configuration_phi3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/configuration_phi3.py
new file mode 100644
index 0000000000000000000000000000000000000000..33cee6b37ba57ed4c1010f78646274a09489e33d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/configuration_phi3.py
@@ -0,0 +1,240 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Phi-3 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Phi3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the
+    [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32064):
+            Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Phi3Model`].
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            Dropout probability for mlp outputs.
+        embd_pdrop (`int`, *optional*, defaults to 0.0):
+            The dropout ratio for the embeddings.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio after computing the attention scores.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model was trained with. This is used to determine the size of the
+            original RoPE embeddings when using long scaling.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value used for the RMSNorm.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
+            the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
+            divided by the number of attention heads divided by 2.
+        partial_rotary_factor (`float`, *optional*, defaults to 1.0):
+            Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 32000):
+            The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*, defaults to 32000):
+            The id of the padding token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If `None`, no sliding window is applied.
+
+    Example:
+
+    ```python
+    >>> from transformers import Phi3Model, Phi3Config
+
+    >>> # Initializing a Phi-3 style configuration
+    >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+    >>> # Initializing a model from the configuration
+    >>> model = Phi3Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "phi3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    base_model_tp_plan = {
+        "layers.*.self_attn.qkv_proj": "colwise_rep",  # we need to replicate here due to the slicing of qkv
+        "layers.*.self_attn.o_proj": "rowwise_rep",  # we need to replicate here due to the slicing of qkv
+        "layers.*.mlp.gate_up_proj": "colwise_rep",  # we need to replicate here due to the `chunk` operation
+        "layers.*.mlp.down_proj": "rowwise_rep",  # we need to replicate here due to the `chunk` operation
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=32064,
+        hidden_size=3072,
+        intermediate_size=8192,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attention_dropout=0.0,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        original_max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=1.0,
+        bos_token_id=1,
+        eos_token_id=32000,
+        pad_token_id=32000,
+        sliding_window=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attention_dropout = attention_dropout
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self._rope_scaling_adjustment()
+        self._rope_scaling_validation()
+        self.sliding_window = sliding_window
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_adjustment(self):
+        """
+        Adjust the `type` of the `rope_scaling` configuration for backward compatibility.
+        """
+        if self.rope_scaling is None:
+            return
+
+        rope_scaling_type = self.rope_scaling.get("type", None)
+
+        # For backward compatibility if previous version used "su" or "yarn"
+        if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]:
+            self.rope_scaling["type"] = "longrope"
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type != "longrope":
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}")
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor)
+        if not len(rope_scaling_short_factor) == rotary_ndims // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == rotary_ndims // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}"
+            )
+
+
+__all__ = ["Phi3Config"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modeling_phi3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modeling_phi3.py
new file mode 100644
index 0000000000000000000000000000000000000000..23820075a02056eb6b358cc25529f60b0a784364
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modeling_phi3.py
@@ -0,0 +1,547 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/phi3/modular_phi3.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_phi3.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from transformers.utils.generic import check_model_inputs
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_phi3 import Phi3Config
+
+
+class Phi3MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        up_states = self.gate_up_proj(hidden_states)
+
+        gate, up_states = up_states.chunk(2, dim=-1)
+        up_states = up_states * self.activation_fn(gate)
+
+        return self.down_proj(up_states)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
+    k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+class Phi3Attention(nn.Module):
+    """Multi-headed self-attention using one fused QKV projection, with RoPE applied to queries and keys."""
+
+    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        # Fused projection width: every query head plus the key heads and the value heads.
+        op_size = config.num_attention_heads * self.head_dim + 2 * (config.num_key_value_heads * self.head_dim)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.qkv_proj = nn.Linear(config.hidden_size, op_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        # One fused matmul; the last axis is then sliced into query | key | value.
+        qkv = self.qkv_proj(hidden_states)
+        query_pos = self.config.num_attention_heads * self.head_dim
+        query_states = qkv[..., :query_pos]
+        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+        # Split the last axis into (heads, head_dim) and swap axes 1 and 2.
+        query_states = query_states.view(hidden_shape).transpose(1, 2)
+        key_states = key_states.view(hidden_shape).transpose(1, 2)
+        value_states = value_states.view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Eager attention is the default; any other configured implementation is looked up in the registry.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=getattr(self.config, "sliding_window", None),
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Phi3RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Phi3RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Phi3DecoderLayer(GradientCheckpointingLayer):  # pre-norm attention block then pre-norm MLP block
+    def __init__(self, config: Phi3Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Phi3MLP(config)
+        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.config = config
+        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
+        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # `position_ids` and `use_cache` are not named parameters of Phi3Attention.forward; they land in its **kwargs.
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + self.resid_attn_dropout(hidden_states)  # main diff with Llama
+        # Only the hidden states are returned; the attention weights unpacked above are not used further here.
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.resid_mlp_dropout(hidden_states)  # main diff with Llama
+        return hidden_states
+
+
+@auto_docstring
+class Phi3PreTrainedModel(PreTrainedModel):  # shared base carrying class-level settings for the Phi-3 models
+    config: Phi3Config
+    base_model_prefix = "model"  # same name as the `self.model` attribute set in Phi3ForCausalLM.__init__
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Phi3DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    # NOTE(review): these flags are consumed by the PreTrainedModel base class; their exact effect is not visible here.
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": Phi3DecoderLayer,
+        "attentions": Phi3Attention,
+    }
+    _version = "0.0.5"
+
+
+class Phi3RotaryEmbedding(nn.Module):  # produces the (cos, sin) rotary tables for a batch of position ids
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Phi3Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        # `rope_type` selects the initialiser out of ROPE_INIT_FUNCTIONS.
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        # The initialiser yields the inverse frequencies plus a factor that forward multiplies into cos and sin.
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # "mps" (or a non-string device type) falls back to "cpu" as the autocast device_type.
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+        # Both tables are returned in the dtype of `x`.
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class Phi3Model(Phi3PreTrainedModel):  # token embedding -> stacked decoder layers -> final RMSNorm
+    def __init__(self, config: Phi3Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Phi3RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):  # true when both or neither are supplied
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # A fresh DynamicCache is created only when caching is requested and the caller supplied none.
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        # Without explicit cache positions, numbering continues after the tokens already held in the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        # A configured sliding window selects the windowed mask builder; otherwise the plain causal one is used.
+        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
+        causal_mask = mask_function(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        # The rotary (cos, sin) pair is computed once here and handed to every decoder layer.
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+
+
+@auto_docstring
+class Phi3ForCausalLM(Phi3PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Phi3Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # hidden -> vocab, no bias
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Phi3ForCausalLM
+
+        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions (0 keeps all, since slice(-0, None) == slice(0, None)).
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the
+        # process
+
+        # When the first time input length reached long and short factor switching point, enforce re-compute cache
+        # It will cause downside of slower at this single token position, however, better than current failure.
+        if (
+            past_key_values
+            and self.config.rope_scaling
+            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
+        ):
+            past_length = cache_position[0]
+            if past_length <= self.config.original_max_position_embeddings:
+                past_key_values = None
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+        return model_inputs
+
+
+class Phi3ForSequenceClassification(GenericForSequenceClassification, Phi3PreTrainedModel):
+    pass  # no Phi3-specific overrides: behaviour is inherited unchanged from the two base classes
+
+
+class Phi3ForTokenClassification(GenericForTokenClassification, Phi3PreTrainedModel):
+    pass  # no Phi3-specific overrides: behaviour is inherited unchanged from the two base classes
+
+
+__all__ = [
+    "Phi3PreTrainedModel",
+    "Phi3Model",
+    "Phi3ForCausalLM",
+    "Phi3ForSequenceClassification",
+    "Phi3ForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modular_phi3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modular_phi3.py
new file mode 100644
index 0000000000000000000000000000000000000000..d355c3792a6b056165d6acbd0db8677bb9784728
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi3/modular_phi3.py
@@ -0,0 +1,272 @@
+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PyTorch Phi-3 model."""
+
+from typing import Callable, Optional
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+from ..mistral.modeling_mistral import (
+    MistralDecoderLayer,
+    MistralForCausalLM,
+    MistralForSequenceClassification,
+    MistralForTokenClassification,
+    MistralPreTrainedModel,
+    eager_attention_forward,
+    rotate_half,
+)
+from .configuration_phi3 import Phi3Config
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
+_CONFIG_FOR_DOC = "Phi3Config"
+
+
+class Phi3MLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        up_states = self.gate_up_proj(hidden_states)
+
+        gate, up_states = up_states.chunk(2, dim=-1)
+        up_states = up_states * self.activation_fn(gate)
+
+        return self.down_proj(up_states)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
+    k_embed = torch.cat([(k_rot * cos) + (rotate_half(k_rot) * sin), k_pass], dim=-1)
+    return q_embed, k_embed
+
+
+class Phi3Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper, with one fused q/k/v projection (`qkv_proj`)"""
+
+    def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5  # 1 / sqrt(head_dim)
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        # Width of the fused projection: every query head plus the key heads and the value heads.
+        op_size = config.num_attention_heads * self.head_dim + 2 * (config.num_key_value_heads * self.head_dim)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.qkv_proj = nn.Linear(config.hidden_size, op_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        # Project once, then slice the result into query, key and value along the last dimension.
+        qkv = self.qkv_proj(hidden_states)
+        query_pos = self.config.num_attention_heads * self.head_dim
+        query_states = qkv[..., :query_pos]
+        key_states = qkv[..., query_pos : query_pos + self.num_key_value_heads * self.head_dim]
+        value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :]
+        # assumes hidden_states is (batch, seq, hidden), giving (batch, heads, seq, head_dim) here -- TODO confirm
+        query_states = query_states.view(hidden_shape).transpose(1, 2)
+        key_states = key_states.view(hidden_shape).transpose(1, 2)
+        value_states = value_states.view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Fall back to the eager function unless the config names another registered attention implementation.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=getattr(self.config, "sliding_window", None),
+            **kwargs,
+        )
+        # Collapse the trailing dimensions back into a single feature dimension for the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Phi3DecoderLayer(MistralDecoderLayer):
+    def __init__(self, config: Phi3Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.config = config
+        self.self_attn = Phi3Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Phi3MLP(config)
+        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
+        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # `position_ids` and `use_cache` are not named parameters of `Phi3Attention.forward`; they travel via **kwargs.
+        hidden_states, self_attn_weights = self.self_attn(  # the attention weights are not used further here
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + self.resid_attn_dropout(hidden_states)  # main diff with Llama
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.resid_mlp_dropout(hidden_states)  # main diff with Llama
+        return hidden_states
+
+
+class Phi3PreTrainedModel(MistralPreTrainedModel):
+    _version = "0.0.5"  # NOTE(review): meaning not visible here -- presumably a format/version marker; confirm
+
+
+class Phi3ForCausalLM(MistralForCausalLM):
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- this model may need to switch between short and long rope, invalidating the cache in the
+        # process
+
+        # The first time the input length crosses the short/long rope factor switching point, drop the cache so that it
+        # is recomputed. That single step is slower, but it is better than the failure that would otherwise occur.
+        if (
+            past_key_values
+            and self.config.rope_scaling
+            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
+        ):
+            past_length = cache_position[0]  # NOTE(review): assumes cache_position is not None -- TODO confirm
+            if past_length <= self.config.original_max_position_embeddings:
+                past_key_values = None
+        # Delegate to GenerationMixin's implementation explicitly (not via `super()`), with the possibly-cleared cache.
+        model_inputs = GenerationMixin.prepare_inputs_for_generation(
+            self,
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+        return model_inputs
+
+
+class Phi3ForSequenceClassification(MistralForSequenceClassification):
+    pass  # no overrides: behaviour is inherited unchanged from the Mistral base class
+
+
+class Phi3ForTokenClassification(MistralForTokenClassification):
+    pass  # no overrides: behaviour is inherited unchanged from the Mistral base class
+
+
+__all__ = [
+    "Phi3PreTrainedModel",
+    "Phi3Model",  # noqa: F822
+    "Phi3ForCausalLM",
+    "Phi3ForSequenceClassification",
+    "Phi3ForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52eb154040e56d482fdb25840afdaa84b062ecf2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a7625a06c47dc307419efdab05d7b11068dcc43a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aef8ae6edd715d71645f8c78984dccbef7f56791
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a724c22e13a040c5dbb1e9f2b17045e1cfab04f3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/configuration_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/configuration_pop2piano.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8eb49d9785b4eb70fd8513a741a2a4ae2fc54956
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/configuration_pop2piano.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/feature_extraction_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/feature_extraction_pop2piano.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a23da2fba74892aeb88758560fbbd7ce5fce17c6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/feature_extraction_pop2piano.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/modeling_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/modeling_pop2piano.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aee60b88331b873e921e9a5e460b4c6e71d5ca03
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/modeling_pop2piano.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/processing_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/processing_pop2piano.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de949392e04026370f846ccdcf9ecd834dbf0481
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/processing_pop2piano.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/tokenization_pop2piano.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/tokenization_pop2piano.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5aaa2f67ad61c9608ed9cbf831a48ffdd33fd65
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pop2piano/__pycache__/tokenization_pop2piano.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2062d730e55ca57e0c8901b53a2927611654c71
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b84d359475e580df6214a6ab60bf28ec5cc47b7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd1e1308cc1265a16632aba88c3241517dd08b22
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1ffdcb6d777783052422e55652f2ba47726ca81
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ed2f8e310292b8d3b71e78c82ea0e6b838c9225
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e45ef66a183f2748bab1462194d026f5efc724f5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60e5656459c2469790dc8e34ab3c31a1a37a698c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/configuration_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/configuration_prophetnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc2494c12750abc3ee449c9896f2eef7fcf368cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/configuration_prophetnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9f21035aab5d55669af855be12bc9dc116b6332
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/modeling_prophetnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/tokenization_prophetnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/tokenization_prophetnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56b2e03277c67ad6939ca44a5f4dda2847f3ec20
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/prophetnet/__pycache__/tokenization_prophetnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47209372d997a1579385d2139cca741b9c63f752
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/modeling_pvt_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/modeling_pvt_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20ee818ed82ecd23ccbf7e08a88d9ca2ad38a111
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/modeling_pvt_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e30f6c7325b4acba3f38ac076ab915352bec707
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/configuration_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/configuration_qwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08d4ecb0cb6bcefa2fc79276635c0947028d0874
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/configuration_qwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modeling_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modeling_qwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..751b383d34a48812fcd1c665d3f5fd00b2d3171a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modeling_qwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modular_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modular_qwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84a8989cc983f69f0ae24bcf08a26b58a937aa15
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/modular_qwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3608263468e8f502e0551b75b3d47c6b996cc697
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02ad73526392560a9c3a5f01219ecc5b73ce5435
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2/__pycache__/tokenization_qwen2_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62e01fea648c4ea6d62786a2896733c35c704353
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/configuration_qwen3_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/configuration_qwen3_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15f84c9be4b76b0cbf1e6d36ce99b867b7c9c9c6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/configuration_qwen3_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modeling_qwen3_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modeling_qwen3_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf267058d6d43dcf23d504c10a22082bf7c7cd9e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modeling_qwen3_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modular_qwen3_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modular_qwen3_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f4c957d500ad4084cc73b094d8bd87ac94e0d1f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_moe/__pycache__/modular_qwen3_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c025276e6145169e6240c7a442fea6f829598ba7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a295d582056d3da5ebf931c3da71a3cfafc84ee4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84ef03443bd6176f1522de3866eac9363ab34f76
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/configuration_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/configuration_qwen3_vl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34372fe818de71a0c7e81c0f284372aa404d57fc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/configuration_qwen3_vl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modeling_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modeling_qwen3_vl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32d3438aa57738f63a234b26970dabc82c77bf8d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modeling_qwen3_vl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modular_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modular_qwen3_vl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4090a0bfedf990711d933830c82d383e6054dfd7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/modular_qwen3_vl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/video_processing_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/video_processing_qwen3_vl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b917b24e143dada98220305dab6e1f965a3b2c6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/video_processing_qwen3_vl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4000cb272723d9920136a6c78465e8413a8b4d1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_qwen3_vl_moe import *
+    from .modeling_qwen3_vl_moe import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..25358aa79bff482437632829f6319effad36f138
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py
@@ -0,0 +1,335 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_qwen3_vl_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class Qwen3VLMoeTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen3-VL-MoE text model. Defines the number of different tokens that can be represented
+            by the `input_ids` passed when calling [`Qwen3VLMoeTextModel`].
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 4):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 60):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
+            Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock
+            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        head_dim (`int`, *optional*):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3VLMoe style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_moe_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen3VLMoe`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        decoder_sparse_step=1,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=4,
+        num_experts=60,
+        norm_topk_prob=True,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=None,
+        rope_scaling=None,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        self.head_dim = head_dim or hidden_size // num_attention_heads
+
+        rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3VLMoeVisionConfig(PretrainedConfig):
+    model_type = "qwen3_vl_moe"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        deepstack_visual_indexes=[8, 16, 24],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+        self.deepstack_visual_indexes = deepstack_visual_indexes
+
+
+class Qwen3VLMoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the video prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3-VL-MOE style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_moe"
+    sub_configs = {"vision_config": Qwen3VLMoeVisionConfig, "text_config": Qwen3VLMoeTextConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+
+
+__all__ = ["Qwen3VLMoeConfig", "Qwen3VLMoeTextConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..88e3f6e19f0eadbda8853b5c3b8f39fc7e61e580
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py
@@ -0,0 +1,1832 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_qwen3_vl_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import OutputRecorder, check_model_inputs
+from .configuration_qwen3_vl_moe import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig, Qwen3VLMoeVisionConfig
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Qwen3VLMoeTextRMSNorm(nn.Module):
+    """Root-mean-square layer norm with a learnable per-channel scale (no bias, no mean subtraction)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Args:
+            hidden_size: size of the last (normalized) dimension.
+            eps: constant added to the mean square before the reciprocal square root.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # Statistics are computed in float32, then the result is cast back before scaling.
+        orig_dtype = hidden_states.dtype
+        upcast = hidden_states.to(torch.float32)
+        mean_square = upcast.pow(2).mean(-1, keepdim=True)
+        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * normalized.to(orig_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Qwen3VLMoeTextExperts(nn.Module):
+    """
+    All routed experts stored as two stacked 3D parameters rather than one sub-module per expert.
+
+    Both parameters are allocated with `torch.empty`, so they hold arbitrary values until they are
+    initialized (see `Qwen3VLMoePreTrainedModel._init_weights`) or loaded from a checkpoint.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_experts = config.num_experts
+        self.intermediate_size = config.moe_intermediate_size
+        self.hidden_size = config.hidden_size
+        self.expert_dim = self.intermediate_size
+        # (num_experts, hidden_size, 2 * expert_dim): gate and up projections fused along the last dim.
+        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+        # (num_experts, expert_dim, hidden_size)
+        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(
+        self, hidden_states: torch.Tensor, routing_weights: torch.Tensor, router_indices: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Apply the experts to every token and combine their outputs with the routing weights.
+
+        In training mode only the experts that received at least one token are evaluated, one at a time,
+        to keep memory bounded. In eval mode every expert is evaluated on every token with two batched
+        matmuls, trading memory for speed; unselected experts are expected to be cancelled by their
+        weights in `routing_weights`.
+
+        Args:
+            hidden_states (torch.Tensor): flattened internally to (num_tokens, hidden_size). The size of its
+                first dimension on entry is reused as the leading dimension of the output.
+                NOTE(review): the caller visible in this file passes a (batch_size, seq_len, hidden_size)
+                tensor, not the 2D (batch_size * token_num, hidden_size) layout; confirm the intended contract.
+            routing_weights (torch.Tensor): (batch_size * token_num, num_experts)
+            router_indices (torch.Tensor): (batch_size * token_num, top_k)
+        Returns:
+            torch.Tensor: reshaped to (hidden_states.shape[0], -1, hidden_size).
+        """
+        # Size of the first input dimension *before* flattening; used to restore the output layout.
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
+        if self.training:
+            # Accumulator for the weighted expert outputs, one row per token.
+            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+            with torch.no_grad():
+                # one_hot appends a num_experts axis; the permute reverses the three axes so that the
+                # expert axis comes first (assumes router_indices is 2D as documented above).
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=self.num_experts)
+                expert_mask = expert_mask.permute(2, 1, 0)
+                # we sum on the top_k and on the sequence length to get which experts
+                # are hit this time around
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+            # `nonzero()` yields one row per hit expert, so `expert_idx` is a 1-element tensor.
+            for expert_idx in expert_hit[:]:
+                with torch.no_grad():
+                    # Indices of the tokens routed to this expert (the top_k slot is discarded).
+                    _, token_idx = torch.where(expert_mask[expert_idx[0]])
+                current_state = hidden_states[token_idx]
+                # Indexing with the 1-element tensor keeps a leading dim of size 1 on the weights,
+                # which is why the result is unwrapped with `out[0]` below.
+                gate_up = current_state @ self.gate_up_proj[expert_idx]
+                gate, up = gate_up.chunk(2, dim=-1)
+                gated_output = up * self.act_fn(gate)
+                out = gated_output @ self.down_proj[expert_idx]
+                weighted_output = out[0] * routing_weights[token_idx, expert_idx, None]
+                # Scatter-add the contribution back onto the rows of the selected tokens.
+                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
+        else:
+            # Replicate the tokens once per expert: (num_experts, num_tokens, hidden_size).
+            hidden_states = hidden_states.repeat(self.num_experts, 1)
+            hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
+            gate_up = torch.bmm(hidden_states, self.gate_up_proj)
+            gate, up = gate_up.chunk(2, dim=-1)  # not supported for DTensors
+            next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
+            next_states = next_states.reshape(self.num_experts, batch_size, -1, self.hidden_size)
+            # Weight every expert's output per token, then sum over the expert axis.
+            next_states = (
+                next_states * routing_weights.transpose(0, 1).view(self.num_experts, batch_size, -1)[..., None]
+            )
+            next_states = next_states.sum(dim=0)
+        return next_states
+
+
+class Qwen3VLMoeTextSparseMoeBlock(nn.Module):
+    """Top-k softmax router together with the stacked experts it dispatches tokens to."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+        # Router: one logit per expert for every token.
+        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
+        self.experts = Qwen3VLMoeTextExperts(config)
+
+        # since all the models use norm_topk_prob, we don't need to have an extra check for it
+        # self.norm_topk_prob = config.norm_topk_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Route each token to its `top_k` experts and return the mixed expert output.
+
+        Args:
+            hidden_states (torch.Tensor): tensor whose last dimension is `hidden_size`; it is flattened
+                to (num_tokens, hidden_size) for routing.
+
+        Returns:
+            tuple[torch.Tensor, torch.Tensor]: `(routed_out, router_logits)`, where `router_logits` has
+            shape (num_tokens, num_experts) and `routed_out` is whatever `self.experts` returns.
+        """
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        router_logits = self.gate(hidden_states)
+        # Softmax in float32, keep the top-k probabilities and renormalize them to sum to 1.
+        routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float)
+        routing_weights, router_indices = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+        routing_weights = routing_weights.to(hidden_states.dtype)
+        # Dense (num_tokens, num_experts) weight matrix: zero everywhere except the selected experts.
+        router_weights = torch.zeros_like(router_logits).scatter_(1, router_indices, routing_weights)
+        hidden_states = hidden_states.reshape(batch_size, -1, self.hidden_size)
+        routed_out = self.experts(hidden_states, router_weights, router_indices)
+        return routed_out, router_logits
+
+
+def rotate_half(x):
+    """Split the last dim in two halves `(a, b)` and return `(-b, a)` concatenated along that dim."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head axis.
+
+    Same result as `torch.repeat_interleave(x, dim=1, repeats=n_rep)`: the input of shape
+    (batch, num_key_value_heads, seqlen, head_dim) becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+    With `n_rep == 1` the input tensor itself is returned.
+    """
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+        return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """
+    Plain matmul/softmax attention.
+
+    Keys and values are first repeated `module.num_key_value_groups` times along the head axis. The
+    optional additive `attention_mask` is cropped on its last dim to the key length. Softmax runs in
+    float32 and is cast back to the query dtype. Extra keyword arguments are accepted and ignored.
+
+    Returns:
+        `(attn_output, attn_weights)`, where `attn_output` has dims 1 and 2 swapped relative to the
+        matmul result and is made contiguous.
+    """
+    n_groups = module.num_key_value_groups
+    keys = repeat_kv(key, n_groups)
+    values = repeat_kv(value, n_groups)
+
+    scores = torch.matmul(query, keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask[:, :, :, : keys.shape[-2]]
+
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+    context = torch.matmul(probs, values).transpose(1, 2).contiguous()
+
+    return context, probs
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary cos/sin tables.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so they broadcast against `q` and `k`. With `cos`/`sin` of
+            shape [batch_size, seq_len, head_dim], use 1 when `q`/`k` are [batch_size, heads, seq_len, head_dim]
+            and 2 when they are [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)`: the rotated query and key tensors, in that order.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(states):
+        # x * cos + rotate_half(x) * sin
+        return (states * cos) + (rotate_half(states) * sin)
+
+    return _rotate(q), _rotate(k)
+
+
+class Qwen3VLMoeTextAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper, with separate query and key/value head
+    counts, an RMSNorm applied per head to queries and keys, and rotary position embeddings.
+    """
+
+    def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        # Index passed to the cache in `forward` to select this layer's key/value slot.
+        self.layer_idx = layer_idx
+        # Falls back to hidden_size // num_attention_heads when the config has no `head_dim`.
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        # Number of query heads that share one key/value head.
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.q_norm = Qwen3VLMoeTextRMSNorm(
+            self.head_dim, eps=config.rms_norm_eps
+        )  # unlike olmo, only on the head dim!
+        self.k_norm = Qwen3VLMoeTextRMSNorm(
+            self.head_dim, eps=config.rms_norm_eps
+        )  # thus post q_norm does not need reshape
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            hidden_states: input whose last dim is `hidden_size`
+                (assumed (batch, seq_len, hidden_size); TODO confirm against callers).
+            position_embeddings: `(cos, sin)` rotary tables applied to queries and keys.
+            attention_mask: mask handed unchanged to the attention backend.
+            past_key_values: optional cache; when given, this layer's keys/values are passed through
+                `update` and the returned tensors are used for attention.
+            cache_position: forwarded to the cache inside `cache_kwargs`.
+            **kwargs: forwarded to the attention backend.
+
+        Returns:
+            `(attn_output, attn_weights)` as produced by the backend, with `attn_output` projected by `o_proj`.
+        """
+        input_shape = hidden_states.shape[:-1]
+        # Split the projected features into heads of size head_dim; -1 infers the head count.
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        # Project, reshape into heads, normalize per head (q/k only), then swap dims 1 and 2.
+        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # Use the eager implementation unless the config selects another registered backend.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        # Merge the head dims back into a single feature dim before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Qwen3VLMoeTextMLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`, all projections bias-free."""
+
+    def __init__(self, config, intermediate_size=None):
+        """
+        Args:
+            config: provides `hidden_size`, `hidden_act` and the default `intermediate_size`.
+            intermediate_size: optional override of `config.intermediate_size`.
+        """
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        gated = self.act_fn(self.gate_proj(x))
+        return self.down_proj(gated * self.up_proj(x))
+
+
+class Qwen3VLMoeTextDecoderLayer(GradientCheckpointingLayer):
+    """Pre-norm decoder layer: self-attention followed by either a sparse MoE block or a dense MLP."""
+
+    def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Qwen3VLMoeTextAttention(config, layer_idx)
+
+        # A layer is sparse when it is not listed in `mlp_only_layers`, experts are configured, and its
+        # 1-based index is a multiple of `decoder_sparse_step`; otherwise it uses the dense MLP.
+        if (layer_idx not in config.mlp_only_layers) and (
+            config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
+        ):
+            self.mlp = Qwen3VLMoeTextSparseMoeBlock(config)
+        else:
+            self.mlp = Qwen3VLMoeTextMLP(config, intermediate_size=config.intermediate_size)
+
+        self.input_layernorm = Qwen3VLMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen3VLMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> torch.FloatTensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask forwarded unchanged to the
+                self-attention module.
+            position_ids (`torch.LongTensor`, *optional*): forwarded to the self-attention module as a
+                keyword argument.
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            kwargs (`dict`, *optional*):
+                Extra keyword arguments, forwarded to the self-attention module.
+
+        Returns:
+            `torch.FloatTensor`: the layer output. Attention weights and router logits are discarded here.
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        # For the MoE layers, we need to unpack
+        # (the sparse block returns `(routed_out, router_logits)`; only the first item is kept).
+        if isinstance(hidden_states, tuple):
+            hidden_states, _ = hidden_states
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+@auto_docstring
+class Qwen3VLMoePreTrainedModel(PreTrainedModel):
+    # Capability flags and bookkeeping read by the `PreTrainedModel` machinery.
+    config: Qwen3VLMoeConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Qwen3VLMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+    _supports_attention_backend = True
+    # Maps an output name to the module type (or recorder) whose outputs are captured for it.
+    _can_record_outputs = {
+        "router_logits": OutputRecorder(Qwen3VLMoeTextSparseMoeBlock, index=1),
+        "hidden_states": Qwen3VLMoeTextDecoderLayer,
+        "attentions": Qwen3VLMoeTextAttention,
+    }
+
+    def _init_weights(self, module):
+        """Run the parent initialization, then draw stacked expert weights from N(0, std)."""
+        super()._init_weights(module)
+        cfg = self.config
+        # Prefer the top-level `initializer_range`; otherwise use the text config's, defaulting to 0.02.
+        std = (
+            cfg.initializer_range
+            if hasattr(cfg, "initializer_range")
+            else getattr(cfg.get_text_config(), "initializer_range", 0.02)
+        )
+        if isinstance(module, Qwen3VLMoeTextExperts):
+            for expert_weight in (module.gate_up_proj, module.down_proj):
+                expert_weight.data.normal_(mean=0.0, std=std)
+
+
+class Qwen3VLMoeVisionMLP(nn.Module):
+    """Two-layer feed-forward block with biases: `linear_fc2(act_fn(linear_fc1(x)))`."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.linear_fc1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=True)
+        self.linear_fc2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        expanded = self.linear_fc1(hidden_state)
+        activated = self.act_fn(expanded)
+        return self.linear_fc2(activated)
+
+
+class Qwen3VLMoeVisionPatchEmbed(nn.Module):
+    """Embed flattened patches with a 3D convolution whose kernel and stride both equal the patch size."""
+
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.in_channels = config.in_channels
+        self.embed_dim = config.hidden_size
+
+        # (temporal, height, width); used for both kernel_size and stride.
+        patch_dims = [self.temporal_patch_size] + [self.patch_size] * 2
+        self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=patch_dims, stride=patch_dims, bias=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Cast the input to the convolution weight's dtype before projecting.
+        weight_dtype = self.proj.weight.dtype
+        patches = hidden_states.view(
+            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
+        )
+        embedded = self.proj(patches.to(dtype=weight_dtype))
+        # One row of size embed_dim per patch.
+        return embedded.view(-1, self.embed_dim)
+
+
+class Qwen3VLMoeVisionRotaryEmbedding(nn.Module):
+    """Produces the table of rotary angles `position * inv_freq` for positions `0 .. seqlen - 1`."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        # inv_freq[i] = theta ** (-2i / dim); registered as a non-persistent buffer (not saved in state_dict).
+        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
+        self.register_buffer("inv_freq", 1.0 / (theta**exponents), persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        # Shape: (seqlen, len(inv_freq)).
+        return torch.outer(positions, self.inv_freq)
+
+
+class Qwen3VLMoeVisionPatchMerger(nn.Module):
+    """
+    Regroup features into rows of `hidden_size * spatial_merge_size**2` and project them to `out_hidden_size`
+    with a LayerNorm and a two-layer GELU MLP. The LayerNorm runs either before or after the regrouping.
+    """
+
+    def __init__(self, config: Qwen3VLMoeVisionConfig, use_postshuffle_norm=False) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size * (config.spatial_merge_size**2)
+        self.use_postshuffle_norm = use_postshuffle_norm
+        # The norm width follows its placement: merged width when applied after the regrouping.
+        norm_width = self.hidden_size if use_postshuffle_norm else config.hidden_size
+        self.norm = nn.LayerNorm(norm_width, eps=1e-6)
+        self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size)
+        self.act_fn = nn.GELU()
+        self.linear_fc2 = nn.Linear(self.hidden_size, config.out_hidden_size)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.use_postshuffle_norm:
+            # Regroup first so the norm sees the merged feature width.
+            x = x.view(-1, self.hidden_size)
+        merged = self.norm(x).view(-1, self.hidden_size)
+        return self.linear_fc2(self.act_fn(self.linear_fc1(merged)))
+
+
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to `q` and `k` in float32 and cast each result back to its input dtype.
+
+    `cos` and `sin` get a new second-to-last dimension so they broadcast against `q` and `k`.
+    """
+    q_dtype, k_dtype = q.dtype, k.dtype
+    q_fp32, k_fp32 = q.float(), k.float()
+    cos_fp32 = cos.unsqueeze(-2).float()
+    sin_fp32 = sin.unsqueeze(-2).float()
+    q_rotated = (q_fp32 * cos_fp32) + (rotate_half(q_fp32) * sin_fp32)
+    k_rotated = (k_fp32 * cos_fp32) + (rotate_half(k_fp32) * sin_fp32)
+    return q_rotated.to(q_dtype), k_rotated.to(k_dtype)
+
+
+class Qwen3VLMoeVisionAttention(nn.Module):
+    """
+    Non-causal self-attention over a packed sequence of vision tokens.
+
+    Several sequences are concatenated along the token axis; `cu_seqlens` holds their cumulative
+    boundaries so that attention is computed within each sequence only.
+    """
+
+    def __init__(self, config: Qwen3VLMoeVisionConfig) -> None:
+        super().__init__()
+        self.dim = config.hidden_size
+        self.num_heads = config.num_heads
+        self.head_dim = self.dim // self.num_heads
+        self.num_key_value_groups = 1  # needed for eager attention
+        # Fused query/key/value projection.
+        self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)
+        self.proj = nn.Linear(self.dim, self.dim)
+        self.scaling = self.head_dim**-0.5
+        self.config = config
+        self.attention_dropout = 0.0
+        self.is_causal = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: packed tokens; the first dim is the total token count.
+            cu_seqlens: cumulative sequence boundaries; consecutive differences give per-sequence lengths.
+            rotary_pos_emb: accepted but not read in this method.
+            position_embeddings: `(cos, sin)` rotary tables. Although the default is `None`, the value is
+                unpacked unconditionally, so callers must provide it.
+            **kwargs: forwarded to the attention backend.
+
+        Returns:
+            Tensor with `seq_length` rows after the output projection.
+        """
+        seq_length = hidden_states.shape[0]
+        # (seq, 3 * dim) -> (seq, 3, heads, head_dim) -> (3, seq, heads, head_dim) -> three tensors.
+        query_states, key_states, value_states = (
+            self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        )
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
+
+        # Add a leading batch dim of 1: (1, heads, seq, head_dim).
+        query_states = query_states.transpose(0, 1).unsqueeze(0)
+        key_states = key_states.transpose(0, 1).unsqueeze(0)
+        value_states = value_states.transpose(0, 1).unsqueeze(0)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        if self.config._attn_implementation == "flash_attention_2":
+            # Flash Attention 2: Use cu_seqlens for variable length attention
+            max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=None,
+                scaling=self.scaling,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                cu_seq_lens_q=cu_seqlens,
+                cu_seq_lens_k=cu_seqlens,
+                max_length_q=max_seqlen,
+                max_length_k=max_seqlen,
+                is_causal=False,
+                **kwargs,
+            )
+        else:
+            # Other implementations: Process each chunk separately
+            lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+            # Split q, k and v along the token axis (dim 2) into one chunk per sequence.
+            splits = [
+                torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
+            ]
+
+            attn_outputs = [
+                attention_interface(
+                    self,
+                    q,
+                    k,
+                    v,
+                    attention_mask=None,
+                    scaling=self.scaling,
+                    dropout=0.0 if not self.training else self.attention_dropout,
+                    is_causal=False,
+                    **kwargs,
+                )[0]
+                for q, k, v in zip(*splits)
+            ]
+            # Re-join the per-sequence outputs along dim 1.
+            # NOTE(review): presumably the token axis of the backend output; confirm for non-eager backends.
+            attn_output = torch.cat(attn_outputs, dim=1)
+
+        # Flatten heads back into the feature dim: (seq_length, heads * head_dim).
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+
+class Qwen3VLMoeVisionBlock(GradientCheckpointingLayer):
+    """Pre-norm vision transformer block: attention, then MLP, each added back onto its input."""
+
+    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
+        # `attn_implementation` is accepted but not read in this constructor.
+        super().__init__()
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=1e-6)
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=1e-6)
+        self.attn = Qwen3VLMoeVisionAttention(config=config)
+        self.mlp = Qwen3VLMoeVisionMLP(config=config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        # Attention sub-layer with residual connection.
+        attn_out = self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + attn_out
+
+        # Feed-forward sub-layer with residual connection.
+        mlp_out = self.mlp(self.norm2(hidden_states))
+        return hidden_states + mlp_out
+
+
+class Qwen3VLMoeVisionModel(Qwen3VLMoePreTrainedModel):
+    config: Qwen3VLMoeVisionConfig
+    _no_split_modules = ["Qwen3VLMoeVisionBlock"]
+
+    def __init__(self, config, *inputs, **kwargs) -> None:
+        # Builds the patch embedding, learned position table, rotary table, transformer blocks, the final
+        # merger and one extra merger per entry of `config.deepstack_visual_indexes`.
+        super().__init__(config, *inputs, **kwargs)
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.patch_size
+        # Number of patches grouped together by one spatial merge.
+        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
+
+        self.patch_embed = Qwen3VLMoeVisionPatchEmbed(
+            config=config,
+        )
+
+        self.pos_embed = nn.Embedding(config.num_position_embeddings, config.hidden_size)
+        # Side length of the position grid, taken as the integer square root of the table size.
+        # NOTE(review): assumes `num_position_embeddings` is a perfect square; confirm.
+        self.num_grid_per_side = int(config.num_position_embeddings**0.5)
+
+        head_dim = config.hidden_size // config.num_heads
+        # The rotary table is built for half the head dim; `rot_pos_emb` concatenates a row part and a column part.
+        self.rotary_pos_emb = Qwen3VLMoeVisionRotaryEmbedding(head_dim // 2)
+
+        self.blocks = nn.ModuleList([Qwen3VLMoeVisionBlock(config) for _ in range(config.depth)])
+        # Final merger: LayerNorm is applied before the features are regrouped.
+        self.merger = Qwen3VLMoeVisionPatchMerger(
+            config=config,
+            use_postshuffle_norm=False,
+        )
+
+        self.deepstack_visual_indexes = config.deepstack_visual_indexes
+        # One merger per deepstack index; these apply LayerNorm after the regrouping.
+        self.deepstack_merger_list = nn.ModuleList(
+            [
+                Qwen3VLMoeVisionPatchMerger(
+                    config=config,
+                    use_postshuffle_norm=True,
+                )
+                for _ in range(len(config.deepstack_visual_indexes))
+            ]
+        )
+
+        self.gradient_checkpointing = False
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        """
+        Build the rotary angle table for every patch described by `grid_thw`.
+
+        Args:
+            grid_thw: one `(num_frames, height, width)` row per image/video, measured in patches.
+
+        Returns:
+            Tensor with one row per patch: the angles looked up for its row index followed by the angles
+            for its column index. Patches are ordered merge-block by merge-block, with the
+            `merge_size x merge_size` patches of a block adjacent, and the per-frame layout repeated
+            `num_frames` times.
+        """
+        merge_size = self.spatial_merge_size
+
+        # The largest height or width bounds the positions that need a table entry.
+        max_hw = int(grid_thw[:, 1:].max().item())
+        freq_table = self.rotary_pos_emb(max_hw)  # (max_hw, dim // 2)
+        device = freq_table.device
+
+        # Preallocate (row, col) ids for all patches of all entries.
+        total_tokens = int(torch.prod(grid_thw, dim=1).sum().item())
+        pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)
+
+        offset = 0
+        for num_frames, height, width in grid_thw:
+            # Integer division: any remainder of height/width modulo merge_size is dropped.
+            merged_h, merged_w = height // merge_size, width // merge_size
+
+            block_rows = torch.arange(merged_h, device=device)  # block row indices
+            block_cols = torch.arange(merged_w, device=device)  # block col indices
+            intra_row = torch.arange(merge_size, device=device)  # intra-block row offsets
+            intra_col = torch.arange(merge_size, device=device)  # intra-block col offsets
+
+            # Compute full-resolution positions
+            row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
+            col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]
+
+            # Broadcast to (merged_h, merged_w, merge_size, merge_size) and flatten in that order.
+            row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
+            col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
+
+            coords = torch.stack((row_idx, col_idx), dim=-1)
+
+            # Every frame shares the same spatial coordinates.
+            if num_frames > 1:
+                coords = coords.repeat(num_frames, 1)
+
+            num_tokens = coords.shape[0]
+            pos_ids[offset : offset + num_tokens] = coords
+            offset += num_tokens
+
+        embeddings = freq_table[pos_ids]  # lookup rotary embeddings
+        # Merge the (row, col) axis into the feature axis: one flat row of angles per patch.
+        embeddings = embeddings.flatten(1)
+        return embeddings
+
+    def fast_pos_embed_interpolate(self, grid_thw):
+        """Bilinearly interpolate the learned position table onto each image's patch grid.
+
+        `self.pos_embed` is addressed as a flattened `num_grid_per_side x num_grid_per_side`
+        grid (index = row * num_grid_per_side + col). For every `(t, h, w)` row of `grid_thw`
+        the `h x w` target positions are spread evenly over that grid, the four surrounding
+        table entries are blended with bilinear weights, and the result is repeated for each
+        of the `t` frames and reordered so that the patches of one
+        `spatial_merge_size x spatial_merge_size` block are adjacent.
+
+        Args:
+            grid_thw: integer tensor with one `(t, h, w)` row per image/video.
+
+        Returns:
+            Tensor with `t * h * w` rows per input entry, concatenated over all entries.
+        """
+        grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]
+
+        # One list per bilinear corner: (floor, floor), (floor, ceil), (ceil, floor), (ceil, ceil).
+        idx_list = [[] for _ in range(4)]
+        weight_list = [[] for _ in range(4)]
+
+        # `t` is not used in this loop; the temporal repeat happens in the second loop below.
+        for t, h, w in zip(grid_ts, grid_hs, grid_ws):
+            # Fractional source coordinates; created without a device argument (default device).
+            h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h)
+            w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w)
+
+            h_idxs_floor = h_idxs.int()
+            w_idxs_floor = w_idxs.int()
+            # Clamp so the upper neighbour never leaves the table.
+            h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
+            w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
+
+            # Fractional parts act as the interpolation weights.
+            dh = h_idxs - h_idxs_floor
+            dw = w_idxs - w_idxs_floor
+
+            # Row offsets into the flattened table.
+            base_h = h_idxs_floor * self.num_grid_per_side
+            base_h_ceil = h_idxs_ceil * self.num_grid_per_side
+
+            # Outer sum of row offsets (as a column) and column indices -> (h, w) flat indices.
+            indices = [
+                (base_h[None].T + w_idxs_floor[None]).flatten(),
+                (base_h[None].T + w_idxs_ceil[None]).flatten(),
+                (base_h_ceil[None].T + w_idxs_floor[None]).flatten(),
+                (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(),
+            ]
+
+            # Matching bilinear weights, in the same corner order as `indices`.
+            weights = [
+                ((1 - dh)[None].T * (1 - dw)[None]).flatten(),
+                ((1 - dh)[None].T * dw[None]).flatten(),
+                (dh[None].T * (1 - dw)[None]).flatten(),
+                (dh[None].T * dw[None]).flatten(),
+            ]
+
+            # Accumulate as Python lists so all images can be looked up in one embedding call.
+            for i in range(4):
+                idx_list[i].extend(indices[i].tolist())
+                weight_list[i].extend(weights[i].tolist())
+
+        idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=self.pos_embed.weight.device)
+        weight_tensor = torch.tensor(
+            weight_list, dtype=self.pos_embed.weight.dtype, device=self.pos_embed.weight.device
+        )
+        # (4, total_positions, dim): each corner's embedding scaled by its weight, then summed.
+        pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None]
+        patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]
+
+        # Back to one chunk of h * w rows per image.
+        patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)])
+
+        patch_pos_embeds_permute = []
+        merge_size = self.config.spatial_merge_size
+        for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
+            # Same spatial embedding for every frame.
+            pos_embed = pos_embed.repeat(t, 1)
+            # Regroup (row, col) order into merge-block order: both intra-block axes are moved
+            # next to each other before flattening.
+            pos_embed = (
+                pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1)
+                .permute(0, 1, 3, 2, 4, 5)
+                .flatten(0, 4)
+            )
+            patch_pos_embeds_permute.append(pos_embed)
+        patch_pos_embeds = torch.cat(patch_pos_embeds_permute)
+        return patch_pos_embeds
+
+    def forward(
+        self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs
+    ) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """
+        Run the vision encoder and collect the DeepStack side outputs.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input handed to `self.patch_embed`; the embedded result is treated as
+                `(seq_len, hidden_size)`.
+            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height and width of feature shape of each image in LLM.
+            **kwargs:
+                Forwarded unchanged to every block in `self.blocks`.
+
+        Returns:
+            `tuple[torch.Tensor, list[torch.Tensor]]`: the output of `self.merger` applied to the
+            last block's hidden states, and the list of DeepStack features produced by
+            `self.deepstack_merger_list` after each layer listed in `self.deepstack_visual_indexes`
+            (in layer order).
+        """
+        hidden_states = self.patch_embed(hidden_states)
+
+        # Learned absolute position embedding, interpolated to each image's grid.
+        pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
+        hidden_states = hidden_states + pos_embeds
+
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        seq_len, _ = hidden_states.size()
+        hidden_states = hidden_states.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        # Duplicate the frequencies so cos/sin cover both halves of the rotated dimension.
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+
+        # Cumulative token counts with one entry per frame (h * w tokens repeated t times).
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            # Select dtype based on the following factors:
+            #  - FA2 requires that cu_seqlens_q must have dtype int32
+            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+            # See https://github.com/huggingface/transformers/pull/34852 for more information
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        # Prepend the leading 0 boundary.
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+        deepstack_feature_lists = []
+        for layer_num, blk in enumerate(self.blocks):
+            hidden_states = blk(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+            # Tap the intermediate features of the configured layers through their own merger.
+            if layer_num in self.deepstack_visual_indexes:
+                deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](
+                    hidden_states
+                )
+                deepstack_feature_lists.append(deepstack_feature)
+
+        hidden_states = self.merger(hidden_states)
+
+        return hidden_states, deepstack_feature_lists
+
+
+class Qwen3VLMoeTextRotaryEmbedding(nn.Module):
+    """Rotary embedding for the text model with separate temporal/height/width position ids.
+
+    Frequencies are computed for each of the three position-id axes and then interleaved
+    into a single frequency vector per token (see `apply_interleaved_mrope`).
+    """
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Qwen3VLMoeTextConfig, device=None):
+        super().__init__()
+        # `rope_scaling` may be missing or None; normalise it once so that the `rope_type`
+        # and `mrope_section` lookups share the same guard instead of only the first one.
+        rope_scaling = getattr(config, "rope_scaling", None) or {}
+        self.rope_type = rope_scaling.get("rope_type", "default")
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        # Non-persistent: kept out of the state dict.
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+        self.mrope_section = rope_scaling.get("mrope_section", [24, 20, 20])
+
+    def apply_interleaved_mrope(self, freqs, mrope_section):
+        """Apply interleaved MRoPE to 3D rotary embeddings.
+        Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
+        interleaved [THWTHW...TT], preserving frequency continuity.
+
+        The temporal slice `freqs[0]` is overwritten in place: every third slot starting
+        at 1 takes the height frequency and every third slot starting at 2 takes the width
+        frequency, each up to `mrope_section[axis] * 3`; all remaining slots stay temporal.
+        args:
+            freqs: (3, bs, seq_len, head_dim // 2)
+            mrope_section: (3,)
+        returns:
+            freqs_t: (bs, seq_len, head_dim // 2)
+        """
+        freqs_t = freqs[0]  # just overwrite the first dimension T
+        for dim, offset in enumerate((1, 2), start=1):  # H, W
+            length = mrope_section[dim] * 3
+            idx = slice(offset, length, 3)
+            freqs_t[..., idx] = freqs[dim, ..., idx]
+        return freqs_t
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` tables for `position_ids`, cast to the dtype of `x`.
+
+        Args:
+            x: tensor used only for its device type and dtype.
+            position_ids: `(bs, positions)` or `(3, bs, positions)`; a 2D input is
+                broadcast to the three axes.
+
+        Returns:
+            Two tensors of shape `(bs, positions, 2 * len(inv_freq))`, both scaled by
+            `self.attention_scaling`.
+        """
+        # In contrast to other models, Qwen3VLMoe has different position ids for the grids
+        # So we expand the inv_freq to shape (3, ...)
+        if position_ids.ndim == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
+        position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            # Outer product per axis -> (3, bs, positions, dim // 2) after the transpose.
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
+            freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring(
+    custom_intro=(
+        "Text part of Qwen3VLMoe, "
+        "not a pure text-only model, as DeepStack integrates visual features into the early hidden states."
+    )
+)
+class Qwen3VLMoeTextModel(Qwen3VLMoePreTrainedModel):
+    # Decoder stack: token embedding -> decoder layers (with optional DeepStack injection
+    # after the first layers) -> final RMS norm.
+    config: Qwen3VLMoeTextConfig
+    _no_split_modules = ["Qwen3VLMoeTextDecoderLayer"]
+
+    def __init__(self, config: Qwen3VLMoeTextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        # One decoder layer per `num_hidden_layers`, each told its own index.
+        self.layers = nn.ModuleList(
+            [Qwen3VLMoeTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Qwen3VLMoeTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Qwen3VLMoeTextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        # args for deepstack
+        visual_pos_masks: Optional[torch.Tensor] = None,
+        deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
+            The mask of the visual positions.
+        deepstack_visual_embeds (`list[torch.Tensor]`, *optional*):
+            The deepstack visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim).
+            The feature is extracted from the different visual encoder layers, and fed to the decoder
+            hidden states. It's from the paper DeepStack(https://arxiv.org/abs/2406.04334).
+        """
+        # XOR: reject both-missing and both-given.
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        # torch.jit.trace() doesn't support cache objects in the output
+        if use_cache and past_key_values is None and not torch.jit.is_tracing():
+            past_key_values = DynamicCache(config=self.config)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # Default cache positions continue right after the tokens already held in the cache.
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        # the hard coded `3` is for temporal, height and width.
+        if position_ids is None:
+            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
+        elif position_ids.ndim == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+
+        # A leading dimension of 4 carries an extra row of plain text positions in front of the
+        # three rotary axes; otherwise the first rotary axis doubles as the text positions.
+        if position_ids.ndim == 3 and position_ids.shape[0] == 4:
+            text_position_ids = position_ids[0]
+            position_ids = position_ids[1:]
+        else:
+            text_position_ids = position_ids[0]
+
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=text_position_ids,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        for layer_idx, decoder_layer in enumerate(self.layers):
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=text_position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+            # The layer's return value is used directly as the next hidden states.
+            hidden_states = layer_outputs
+
+            # add visual features to the hidden states of first several layers
+            # (layer i receives deepstack_visual_embeds[i] for i < len(deepstack_visual_embeds))
+            if deepstack_visual_embeds is not None and layer_idx in range(len(deepstack_visual_embeds)):
+                hidden_states = self._deepstack_process(
+                    hidden_states,
+                    visual_pos_masks,
+                    deepstack_visual_embeds[layer_idx],
+                )
+
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+    def _deepstack_process(
+        self, hidden_states: torch.Tensor, visual_pos_masks: torch.Tensor, visual_embeds: torch.Tensor
+    ):
+        """Add `visual_embeds` to `hidden_states` at the positions selected by `visual_pos_masks`.
+
+        The mask and embeddings are first moved to the device (and, for the embeddings, dtype)
+        of `hidden_states`. `hidden_states` is modified in place and also returned.
+        `visual_embeds` is expected to provide one row per selected position.
+        """
+        visual_pos_masks = visual_pos_masks.to(hidden_states.device)
+        visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype)
+        # Gather the selected rows, add the visual features, then scatter them back.
+        local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds
+        hidden_states[visual_pos_masks, :] = local_this
+        return hidden_states
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Qwen3VLMoe causal language model (or autoregressive) outputs.
+    """
+)
+class Qwen3VLMoeCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+
+    # Every field is optional and defaults to None.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None  # not described in the docstring above
+    attentions: Optional[tuple[torch.FloatTensor]] = None  # not described in the docstring above
+    rope_deltas: Optional[torch.LongTensor] = None
+    # NOTE(review): not described in the docstring above; presumably the MoE auxiliary
+    # (router) loss - confirm against the code that fills it in.
+    aux_loss: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Qwen3VLMoe outputs, with hidden states and attentions.
+    """
+)
+class Qwen3VLMoeModelOutputWithPast(ModelOutput):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+
+    # Every field is optional and defaults to None.
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+
+
+@auto_docstring
+class Qwen3VLMoeModel(Qwen3VLMoePreTrainedModel):
+    base_model_prefix = ""
+    _checkpoint_conversion_mapping = {}
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+    config: Qwen3VLMoeConfig
+    _no_split_modules = ["Qwen3VLMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+
+    def __init__(self, config):
+        """Build the vision tower and the text model from `config`'s two sub-configs.
+
+        Args:
+            config: object providing `vision_config` and `text_config`.
+        """
+        super().__init__(config)
+        # Both sub-models are instantiated through `_from_config` on their own sub-config.
+        self.visual = Qwen3VLMoeVisionModel._from_config(config.vision_config)
+        self.language_model = Qwen3VLMoeTextModel._from_config(config.text_config)
+        self.rope_deltas = None  # cache rope_deltas here
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input-embedding layer exposed by `self.language_model`."""
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Forward `value` to `self.language_model.set_input_embeddings`."""
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        """Replace `self.language_model` with `decoder`."""
+        self.language_model = decoder
+
+    def get_decoder(self):
+        """Return `self.language_model`."""
+        return self.language_model
+
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Different from the original implementation, Qwen3VLMoe use timestamps rather than absolute time position ids.
+
+        Builds the three-axis (temporal, height, width) position ids for the text model.
+
+        When `input_ids` and at least one grid are given, each sample is walked segment by
+        segment: text tokens receive the same running position on all three axes, while the
+        placeholder tokens of an image/video frame receive grid coordinates offset by the
+        position reached so far. Otherwise positions are derived from `attention_mask` alone
+        (or are a plain `arange` when no mask is given).
+
+        Args:
+            input_ids: token ids; indexed as `(batch, seq_len)`.
+            image_grid_thw: one `(t, h, w)` row per image, consumed in order of appearance.
+            video_grid_thw: one `(t, h, w)` row per video; expanded to one row per frame.
+            attention_mask: tokens where the mask is not 1 are skipped in the multimodal path
+                and keep the initial position value 1.
+
+        Returns:
+            `position_ids` of shape `(3, batch, seq_len)` and `mrope_position_deltas` of shape
+            `(batch, 1)` (largest position + 1 minus the sequence length).
+        """
+
+        # Since we use timestamps to separate videos, like <t1> <vision_start> <frame1> <vision_end> <t2> <vision_start> <frame2> <vision_end>, the video_grid_thw should also be split
+        if video_grid_thw is not None:
+            # `repeat_interleave` returns a new tensor, so the assignment below does not touch the caller's grid.
+            video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0)
+            video_grid_thw[:, 0] = 1
+
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        video_token_id = self.config.video_token_id
+        vision_start_token_id = self.config.vision_start_token_id
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            # Initialised to 1 everywhere; only the unmasked positions are overwritten below.
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            # Running indices into the grids, shared across the whole batch.
+            image_index, video_index = 0, 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            # NOTE: the loop variable rebinds `input_ids` to the current (unpadded) row.
+            for i, input_ids in enumerate(total_input_ids):
+                input_ids = input_ids[attention_mask[i] == 1]
+                image_nums, video_nums = 0, 0
+                # The token right after each vision-start marker tells image from video.
+                vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
+                vision_tokens = input_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (vision_tokens == video_token_id).sum()
+                input_tokens = input_ids.tolist()
+                llm_pos_ids_list: list = []
+                st = 0  # index of the first token not yet assigned a position
+                remain_images, remain_videos = image_nums, video_nums
+                for _ in range(image_nums + video_nums):
+                    # Find the next image/video placeholder at or after `st`; a value past the
+                    # end of the sequence marks "none left".
+                    if image_token_id in input_tokens and remain_images > 0:
+                        ed_image = input_tokens.index(image_token_id, st)
+                    else:
+                        ed_image = len(input_tokens) + 1
+                    if video_token_id in input_tokens and remain_videos > 0:
+                        ed_video = input_tokens.index(video_token_id, st)
+                    else:
+                        ed_video = len(input_tokens) + 1
+                    # Whichever placeholder comes first decides which grid is consumed.
+                    if ed_image < ed_video:
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        image_index += 1
+                        remain_images -= 1
+                        ed = ed_image
+
+                    else:
+                        t, h, w = (
+                            video_grid_thw[video_index][0],
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+                        video_index += 1
+                        remain_videos -= 1
+                        ed = ed_video
+                    # Grid size in tokens after spatial merging.
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_merge_size,
+                        w.item() // spatial_merge_size,
+                    )
+                    text_len = ed - st
+
+                    # Text before the vision segment: same running position on all three axes.
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                    # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode the temporal information for videos)
+                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                    # Vision tokens: grid coordinates shifted past the preceding text.
+                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+                    # Skip over the placeholder tokens of this segment.
+                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+                # Trailing text after the last vision segment.
+                if st < len(input_tokens):
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    text_len = len(input_tokens) - st
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                # Delta is measured against the full (padded) row length.
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            if attention_mask is not None:
+                # Running count of attended tokens; masked slots are filled with 1.
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                # No mask: plain 0..seq_len-1 on every axis and zero deltas.
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+            return position_ids, mrope_position_deltas
+
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        """
+        Encodes videos into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned.
+
+        Args:
+            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input videos.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+
+        Returns:
+            Exactly what `get_image_features` returns for the same arguments.
+        """
+        # Same implementation as for images
+        return self.get_image_features(pixel_values_videos, video_grid_thw)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        """
+        Run the vision tower on `pixel_values` and split its merged output per image.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images; cast to the vision tower's dtype first.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+
+        Returns:
+            A tuple of per-image embedding tensors (image `i` holds
+            `t * h * w // spatial_merge_size**2` rows) and the deepstack features exactly as
+            returned by the vision tower.
+        """
+        vision_tower = self.visual
+        merged_embeds, deepstack_embeds = vision_tower(
+            pixel_values.type(vision_tower.dtype), grid_thw=image_grid_thw
+        )
+
+        # Each merged token covers spatial_merge_size**2 patches.
+        patches_per_token = vision_tower.spatial_merge_size**2
+        tokens_per_image = (image_grid_thw.prod(-1) // patches_per_token).tolist()
+
+        return merged_embeds.split(tokens_per_image), deepstack_embeds
+
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: Optional[torch.FloatTensor] = None,
+        video_features: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Locate the image and video placeholder positions and validate them against the features.
+
+        Placeholders are found by token id when `input_ids` is available; otherwise by comparing
+        every row of `inputs_embeds` with the embedding of the placeholder token. Both masks are
+        returned expanded to the shape of `inputs_embeds`. A `ValueError` is raised when features
+        are supplied whose element count differs from the number of masked embedding elements.
+        """
+        if input_ids is not None:
+            special_image_mask = input_ids == self.config.image_token_id
+            special_video_mask = input_ids == self.config.video_token_id
+        else:
+
+            def rows_matching(token_id):
+                # A position counts as a placeholder only if its whole embedding row matches.
+                token = torch.tensor(token_id, dtype=torch.long, device=inputs_embeds.device)
+                return (inputs_embeds == self.get_input_embeddings()(token)).all(-1)
+
+            special_image_mask = rows_matching(self.config.image_token_id)
+            special_video_mask = rows_matching(self.config.video_token_id)
+
+        # Image placeholders: count, broadcast over the hidden dimension, validate.
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None:
+            if inputs_embeds[special_image_mask].numel() != image_features.numel():
+                raise ValueError(
+                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
+                )
+
+        # Video placeholders: same treatment.
+        n_video_tokens = special_video_mask.sum()
+        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if video_features is not None:
+            if inputs_embeds[special_video_mask].numel() != video_features.numel():
+                raise ValueError(
+                    f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
+                )
+
+        return special_image_mask, special_video_mask
+
+    @auto_docstring
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
+        r"""
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        image_mask = None
+        video_mask = None
+
+        if pixel_values is not None:
+            image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw)
+            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
+        if pixel_values_videos is not None:
+            video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            _, video_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+
+        visual_pos_masks = None
+        deepstack_visual_embeds = None
+        if image_mask is not None and video_mask is not None:
+            # aggregate visual_pos_masks and deepstack_visual_embeds
+            image_mask = image_mask[..., 0]
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = image_mask | video_mask
+            deepstack_visual_embeds = []
+            image_mask_joint = image_mask[visual_pos_masks]
+            video_mask_joint = video_mask[visual_pos_masks]
+            for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
+                embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
+                embed_joint[image_mask_joint, :] = img_embed
+                embed_joint[video_mask_joint, :] = vid_embed
+                deepstack_visual_embeds.append(embed_joint)
+        elif image_mask is not None:
+            image_mask = image_mask[..., 0]
+            visual_pos_masks = image_mask
+            deepstack_visual_embeds = deepstack_image_embeds
+        elif video_mask is not None:
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = video_mask
+            deepstack_visual_embeds = deepstack_video_embeds
+
+        if position_ids is None:
+            attention_mask_tensor = (
+                attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
+            )
+            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
+                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
+                # Only apply conversion for floating point tensors (inverted masks)
+                if attention_mask_tensor.dtype.is_floating_point:
+                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
+                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
+
+            # Calculate RoPE index once per generation in the pre-fill stage only.
+            # When compiling, we can't check tensor values thus we check only input length
+            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
+            # models currently cannot do asssisted decoding
+            prefill_compiled_stage = is_torchdynamo_compiling() and (
+                (input_ids is not None and input_ids.shape[1] != 1)
+                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
+            )
+            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
+                (cache_position is not None and cache_position[0] == 0)
+                or (past_key_values is None or past_key_values.get_seq_length() == 0)
+            )
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+                position_ids, rope_deltas = self.get_rope_index(
+                    input_ids,
+                    image_grid_thw,
+                    video_grid_thw,
+                    attention_mask=attention_mask_tensor,
+                )
+                self.rope_deltas = rope_deltas
+            # then use the prev pre-calculated rope-deltas to get the correct position ids
+            else:
+                batch_size, seq_length, _ = inputs_embeds.shape
+                delta = (
+                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    if cache_position is not None
+                    else 0
+                )
+                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+                if cache_position is not None:  # otherwise `deltas` is an int `0`
+                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                position_ids = position_ids.add(delta)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            visual_pos_masks=visual_pos_masks,
+            deepstack_visual_embeds=deepstack_visual_embeds,
+            **kwargs,
+        )
+
+        return Qwen3VLMoeModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            rope_deltas=self.rope_deltas,
+        )
+
+
+def load_balancing_loss_func(
+    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
+    num_experts: Optional[int] = None,
+    top_k=2,
+    attention_mask: Optional[torch.Tensor] = None,
+) -> Union[torch.Tensor, int]:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits:
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts]. Any other input (or an empty tuple) yields `0`.
+        num_experts:
+            Number of experts. When `None`, it is read from the last dimension of the gate logits.
+        top_k:
+            The number of experts to route per-token, can be also interpreted as the `top-k` routing
+            parameter.
+        attention_mask (`torch.Tensor`, *optional*):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+
+    Returns:
+        The auxiliary loss (a scalar tensor), or the int `0` when there are no per-layer gate logits.
+    """
+    # `None`, a bare tensor or an empty tuple gives nothing to balance (and `gate_logits[0]` must exist below).
+    if not isinstance(gate_logits, tuple) or len(gate_logits) == 0:
+        return 0
+
+    # Move every layer's logits to the first layer's device, then stack all layers along dim 0.
+    compute_device = gate_logits[0].device
+    concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+    if num_experts is None:
+        num_experts = concatenated_gate_logits.shape[-1]
+
+    # Routing probabilities per token, then a one-hot encoding of each token's `top_k` chosen experts.
+    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the percentage of tokens routed to each experts (padding positions contribute nothing)
+        masked_assignments = expert_mask.float() * expert_attention_mask
+        tokens_per_expert = torch.sum(masked_assignments, dim=0) / torch.sum(expert_attention_mask, dim=0)
+
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+            .reshape(-1, num_experts)
+            .to(compute_device)
+        )
+
+        # Compute the average probability of routing to these experts (padding positions contribute nothing)
+        masked_routing = routing_weights * router_per_expert_attention_mask
+        router_prob_per_expert = torch.sum(masked_routing, dim=0) / torch.sum(router_per_expert_attention_mask, dim=0)
+
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+
+
+class Qwen3VLMoeForConditionalGeneration(Qwen3VLMoePreTrainedModel, GenerationMixin):
+    """Wraps a `Qwen3VLMoeModel` (`self.model`) with a bias-free linear LM head (`self.lm_head`) for generation."""
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+    config: Qwen3VLMoeConfig
+
+    def __init__(self, config):
+        """Build the base model and the LM head mapping text hidden states to vocabulary logits."""
+        super().__init__(config)
+        self.model = Qwen3VLMoeModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        """Return the input embeddings exposed by the wrapped base model."""
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Forward the new input embeddings to the wrapped base model."""
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        """Forward the new decoder to the wrapped base model."""
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        """Return the decoder exposed by the wrapped base model."""
+        return self.model.get_decoder()
+
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        """Delegate video feature extraction to the wrapped base model."""
+        return self.model.get_video_features(pixel_values_videos, video_grid_thw)
+
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+        """Delegate image feature extraction to the wrapped base model."""
+        return self.model.get_image_features(pixel_values, image_grid_thw)
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def visual(self):
+        return self.model.visual
+
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Qwen3VLMoeCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+        >>> model = Qwen3VLMoeForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct", dtype="auto", device_map="auto")
+        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
+
+        >>> messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image in short."},
+                ],
+            }
+        ]
+
+        >>> # Preparation for inference
+        >>> inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        )
+        >>> inputs = inputs.to(model.device)
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=128)
+        >>> generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        >>> processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "A woman in a plaid shirt sits on a sandy beach at sunset, smiling as she gives a high-five to a yellow Labrador Retriever wearing a harness. The ocean waves roll in the background."
+        ```"""
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+
+        aux_loss = None
+        if kwargs.get("output_router_logits", False):
+            aux_loss = load_balancing_loss_func(
+                getattr(outputs, "router_logits", None),
+                self.config.text_config.num_experts,
+                self.config.text_config.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None and isinstance(aux_loss, torch.Tensor):
+                # `load_balancing_loss_func` returns the plain int `0` (no `.to`) when it gets no tuple of router logits
+                loss += self.config.text_config.router_aux_loss_coef * aux_loss.to(loss.device)
+
+        return Qwen3VLMoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            rope_deltas=outputs.rope_deltas,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            **kwargs,
+        )
+
+        # Qwen3VLMoe position_ids are prepared with rope_deltas in forward
+        model_inputs["position_ids"] = None
+
+        # Once `cache_position` no longer starts at 0, drop the pixel inputs; a missing `cache_position` keeps them
+        if cache_position is not None and cache_position[0] != 0:
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_values_videos"] = None
+
+        return model_inputs
+
+    def _get_image_nums_and_video_nums(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
+        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
+
+        An image (video) is counted wherever an image (video) placeholder position directly follows a vision-start position.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Only used when `inputs_embeds` is `None`.
+            inputs_embeds (`torch.Tensor`, *optional*):
+                Embedded inputs. When given, the special tokens are located by comparing against their embeddings.
+
+        Returns:
+            image_nums (`torch.LongTensor` of shape `(batch_size,)`): number of images in each sample.
+            video_nums (`torch.LongTensor` of shape `(batch_size,)`): number of videos in each sample.
+        """
+        image_token_id = self.config.image_token_id
+        video_token_id = self.config.video_token_id
+        vision_start_token_id = self.config.vision_start_token_id
+
+        if inputs_embeds is not None:
+            embed_tokens = self.get_input_embeddings()
+
+            def _positions_of(token_id):
+                # Compare every position with the token's embedding; only the first feature of the comparison is kept
+                token_embed = embed_tokens(torch.tensor(token_id, dtype=torch.long, device=inputs_embeds.device))
+                return (inputs_embeds == token_embed)[..., 0]
+
+            vision_start_mask = _positions_of(vision_start_token_id)
+            image_mask = _positions_of(image_token_id)
+            video_mask = _positions_of(video_token_id)
+        else:
+            vision_start_mask = input_ids == vision_start_token_id
+            image_mask = input_ids == image_token_id
+            video_mask = input_ids == video_token_id
+
+        vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)
+        image_nums = torch.sum(vision_first_mask & image_mask, dim=1)
+        video_nums = torch.sum(vision_first_mask & video_mask, dim=1)
+
+        return image_nums, video_nums
+
+    def _expand_inputs_for_generation(
+        self,
+        expand_size: int = 1,
+        is_encoder_decoder: bool = False,
+        input_ids: Optional[torch.LongTensor] = None,
+        **model_kwargs,
+    ) -> tuple[torch.LongTensor, dict[str, Any]]:
+        # Overwritten -- Support for expanding tensors without a batch size dimension
+        # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_ts
+        # pixel_values.shape[0] is sum(seqlen_images for samples)
+        # image_grid_thw.shape[0] is sum(num_images for samples)
+
+        if expand_size == 1:
+            return input_ids, model_kwargs
+
+        visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
+
+        def _expand_dict_for_generation_visual(dict_to_expand):
+            image_grid_thw = model_kwargs.get("image_grid_thw", None)
+            video_grid_thw = model_kwargs.get("video_grid_thw", None)
+            image_nums, video_nums = self._get_image_nums_and_video_nums(
+                input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
+            )
+
+            def _repeat_interleave_samples(x, lengths, repeat_times):
+                # Cut `x` into per-sample chunks along dim 0 and repeat each chunk `repeat_times` times back to back
+                samples = torch.split(x, lengths)
+                repeat_args = [repeat_times] + [1] * (x.dim() - 1)
+                return torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
+
+            for key in dict_to_expand:
+                if key == "pixel_values":
+                    # split images into samples
+                    samples = torch.split(image_grid_thw, list(image_nums))
+                    # compute the sequence length of images for each sample
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "image_grid_thw":
+                    # get the num of images for each sample
+                    lengths = list(image_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "pixel_values_videos":
+                    samples = torch.split(video_grid_thw, list(video_nums))
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "video_grid_thw":
+                    lengths = list(video_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "second_per_grid_ts":
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
+                    )
+            return dict_to_expand
+
+        def _expand_dict_for_generation(dict_to_expand):
+            # Remaining tensors are repeated along dim 0; `cache_position` and the visual keys are left untouched here
+            for key, value in dict_to_expand.items():
+                if key != "cache_position" and key not in visual_keys and isinstance(value, torch.Tensor):
+                    dict_to_expand[key] = value.repeat_interleave(expand_size, dim=0)
+            return dict_to_expand
+
+        model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
+
+        if input_ids is not None:
+            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+
+        model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+        if is_encoder_decoder:
+            if model_kwargs.get("encoder_outputs") is None:
+                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+
+        return input_ids, model_kwargs
+
+
+__all__ = [  # names exported by a star-import of this module
+    "Qwen3VLMoeVisionModel",
+    "Qwen3VLMoeForConditionalGeneration",
+    "Qwen3VLMoeModel",
+    "Qwen3VLMoePreTrainedModel",
+    "Qwen3VLMoeTextModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..72d3452bdc50a244b6cf3fdd9bd76dda5b2fc0f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py
@@ -0,0 +1,551 @@
+# coding=utf-8
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Qwen3-VL-MOE model."""
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging
+from ..qwen3_moe.modeling_qwen3_moe import (
+    Qwen3MoeDecoderLayer,
+    Qwen3MoePreTrainedModel,
+    Qwen3MoeRMSNorm,
+    load_balancing_loss_func,
+)
+from ..qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig, Qwen3VLVisionConfig
+from ..qwen3_vl.modeling_qwen3_vl import (
+    Qwen3VLCausalLMOutputWithPast,
+    Qwen3VLForConditionalGeneration,
+    Qwen3VLModel,
+    Qwen3VLTextAttention,
+    Qwen3VLTextModel,
+    Qwen3VLVisionModel,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+class Qwen3VLMoeTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen2MoeModel`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 4):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 60):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
+            Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock
+            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        head_dim (`int`, *optional*):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3VLMoe style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    # Identifier string for this text sub-configuration.
+    model_type = "qwen3_vl_moe_text"
+    # Key under which this config is nested in a composite config; `Qwen3VLMoeConfig.sub_configs` in this
+    # file registers this class under the same "text_config" key.
+    base_config_key = "text_config"
+    # presumably output keys dropped when collecting inference results -- confirm against the base config class
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen3VLMoe`
+    # Maps module-name patterns to a sharding style ("colwise" / "rowwise").
+    # NOTE(review): the MLP entries name `gate_proj` / `up_proj` / `down_proj`, while `Qwen3VLMoeTextExperts`
+    # in this file stores a fused `gate_up_proj` parameter -- confirm these patterns match the MoE layers.
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    # Pipeline plan: submodule name -> (names of the tensors it consumes, names of the tensors it produces).
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        decoder_sparse_step=1,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=4,
+        num_experts=60,
+        norm_topk_prob=True,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=None,
+        rope_scaling=None,
+        head_dim=None,
+        **kwargs,
+    ):
+        """Store the text-backbone hyperparameters on the instance.
+
+        Every named argument except `tie_word_embeddings` is copied to an attribute of the same name, with
+        three normalizations: a `None` `num_key_value_heads` becomes `num_attention_heads`, a falsy
+        `head_dim` becomes `hidden_size // num_attention_heads`, and a `None` `mlp_only_layers` becomes an
+        empty list. `tie_word_embeddings` and any extra `**kwargs` are forwarded to the parent constructor.
+        """
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        # `or` means any falsy value (None, but also 0) falls back to the derived per-head size.
+        self.head_dim = head_dim or hidden_size // num_attention_heads
+
+        # Runs after the RoPE-related attributes above are set; the two multimodal-RoPE keys are passed as
+        # `ignore_keys` (presumably so they are not reported as unknown -- confirm in `rope_config_validation`).
+        rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.router_aux_loss_coef = router_aux_loss_coef
+        # Normalize the `None` default to a fresh list so the attribute is always a list.
+        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
+
+        # Parent constructor is called last and receives `tie_word_embeddings` plus all unrecognized kwargs.
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3VLMoeVisionConfig(Qwen3VLVisionConfig):
+    # Adds nothing of its own: every field and behavior comes from `Qwen3VLVisionConfig`. The subclass gives
+    # the vision config a MoE-specific class name, which `Qwen3VLMoeConfig.sub_configs` refers to.
+    pass
+
+
+class Qwen3VLMoeConfig(Qwen3VLConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Qwen3VLMoeVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the video prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3-VL-MOE style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    # Only the identifier and the sub-config classes are overridden here; the constructor and all other
+    # behavior are inherited from `Qwen3VLConfig`.
+    model_type = "qwen3_vl_moe"
+    # Maps each nested-config attribute name to the MoE-specific config class used for it.
+    sub_configs = {"vision_config": Qwen3VLMoeVisionConfig, "text_config": Qwen3VLMoeTextConfig}
+
+
+class Qwen3VLMoeTextRMSNorm(Qwen3MoeRMSNorm):
+    # Adds nothing of its own: behavior is inherited unchanged from `Qwen3MoeRMSNorm`; only the class name differs.
+    pass
+
+
+class Qwen3VLMoeTextExperts(nn.Module):
+    # Holds the weights of all experts as two stacked 3-D parameters and applies them to routed tokens.
+    def __init__(self, config):
+        super().__init__()
+        self.num_experts = config.num_experts
+        self.intermediate_size = config.moe_intermediate_size
+        self.hidden_size = config.hidden_size
+        self.expert_dim = self.intermediate_size
+        # Fused gate+up projection: one (hidden_size, 2 * expert_dim) matrix per expert. Allocated with
+        # `torch.empty`, so the values are uninitialized until something else fills them in.
+        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+        # Down projection: one (expert_dim, hidden_size) matrix per expert; also uninitialized here.
+        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(
+        self, hidden_states: torch.Tensor, routing_weights: torch.Tensor, router_indices: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        When training it is more efficient to just loop over the experts and compute the output for each expert
+        as otherwise the memory would explode.
+
+        For inference we can sacrifice some memory and compute the output for all experts at once. By repeating the inputs.
+
+        Args:
+            hidden_states (torch.Tensor): its leading dimension is read as `batch_size` and it is then
+                flattened to (num_tokens, hidden_size). The sparse MoE block in this file passes
+                (batch_size, seq_len, hidden_size).
+            routing_weights (torch.Tensor): (num_tokens, num_experts), one weight per token and expert.
+                The inference branch multiplies every expert's output by its weight, so experts that were
+                not selected need a weight of 0.
+            router_indices (torch.Tensor): (num_tokens, top_k) indices of the selected experts. Only read
+                in the training branch.
+        Returns:
+            torch.Tensor: (batch_size, num_tokens // batch_size, hidden_size)
+        """
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)  # (num_tokens, hidden_size)
+        if self.training:
+            # Output accumulator; tokens are added into it expert by expert.
+            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+            with torch.no_grad():
+                # one_hot gives (num_tokens, top_k, num_experts); permute to (num_experts, top_k, num_tokens).
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=self.num_experts)
+                expert_mask = expert_mask.permute(2, 1, 0)
+                # we sum on the top_k and on the sequence length to get which experts
+                # are hit this time around
+                # `nonzero()` returns shape (num_hit_experts, 1), so each loop item is a 1-element tensor.
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+            for expert_idx in expert_hit[:]:
+                with torch.no_grad():
+                    # First result is the top_k slot (unused); second is the indices of tokens routed here.
+                    _, token_idx = torch.where(expert_mask[expert_idx[0]])
+                current_state = hidden_states[token_idx]
+                # Indexing with the 1-element tensor keeps a leading dimension of size 1 on the weights,
+                # so the matmul results below carry that extra leading dimension as well.
+                gate_up = current_state @ self.gate_up_proj[expert_idx]
+                gate, up = gate_up.chunk(2, dim=-1)
+                gated_output = up * self.act_fn(gate)
+                out = gated_output @ self.down_proj[expert_idx]
+                # `out[0]` drops the size-1 leading dimension; the trailing `None` makes the per-token
+                # weight broadcast over the hidden dimension.
+                weighted_output = out[0] * routing_weights[token_idx, expert_idx, None]
+                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
+        else:
+            # Dense path: every expert processes every token; the routing weights select the contributions.
+            hidden_states = hidden_states.repeat(self.num_experts, 1)
+            hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
+            gate_up = torch.bmm(hidden_states, self.gate_up_proj)
+            gate, up = gate_up.chunk(2, dim=-1)  # not supported for DTensors
+            next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
+            next_states = next_states.reshape(self.num_experts, batch_size, -1, self.hidden_size)
+            # Weights are rearranged to (num_experts, batch_size, seq) and broadcast over hidden_size.
+            next_states = (
+                next_states * routing_weights.transpose(0, 1).view(self.num_experts, batch_size, -1)[..., None]
+            )
+            # Sum the weighted expert outputs into one tensor per token.
+            next_states = next_states.sum(dim=0)
+        return next_states
+
+
+class Qwen3VLMoeTextSparseMoeBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_experts = config.num_experts
+        self.top_k = config.num_experts_per_tok
+        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
+        self.experts = Qwen3VLMoeTextExperts(config)
+
+        # since all the models use norm_topk_prob, we don't need to have a extra check for it
+        # self.norm_topk_prob = config.norm_topk_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        router_logits = self.gate(hidden_states)
+        routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float)
+        routing_weights, router_indices = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+        routing_weights = routing_weights.to(hidden_states.dtype)
+        router_weights = torch.zeros_like(router_logits).scatter_(1, router_indices, routing_weights)
+        hidden_states = hidden_states.reshape(batch_size, -1, self.hidden_size)
+        routed_out = self.experts(hidden_states, router_weights, router_indices)
+        return routed_out, router_logits
+
+
+class Qwen3VLMoeTextAttention(Qwen3VLTextAttention):
+    # Adds nothing of its own: behavior is inherited unchanged from `Qwen3VLTextAttention`; only the class name differs.
+    pass
+
+
+class Qwen3VLMoeTextDecoderLayer(Qwen3MoeDecoderLayer):
+    # Adds nothing of its own: behavior is inherited unchanged from `Qwen3MoeDecoderLayer`. The class name is
+    # the one listed in `Qwen3VLMoePreTrainedModel._no_split_modules`.
+    pass
+
+
+class Qwen3VLMoePreTrainedModel(Qwen3MoePreTrainedModel):
+    config: Qwen3VLMoeConfig
+    _no_split_modules = ["Qwen3VLMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        PreTrainedModel._init_weights(self, module)
+        if hasattr(self.config, "initializer_range"):
+            std = self.config.initializer_range
+        else:
+            std = getattr(self.config.get_text_config(), "initializer_range", 0.02)
+        if isinstance(module, Qwen3VLMoeTextExperts):
+            module.gate_up_proj.data.normal_(mean=0.0, std=std)
+            module.down_proj.data.normal_(mean=0.0, std=std)
+
+
+class Qwen3VLMoeVisionModel(Qwen3VLVisionModel):
+    # Adds nothing of its own: behavior is inherited unchanged from `Qwen3VLVisionModel`; only the class name differs.
+    pass
+
+
+class Qwen3VLMoeTextModel(Qwen3VLTextModel):
+    # Adds nothing of its own: behavior is inherited unchanged from `Qwen3VLTextModel`; only the class name differs.
+    pass
+
+
+class Qwen3VLMoeCausalLMOutputWithPast(Qwen3VLCausalLMOutputWithPast):
+    # Extends the parent output with one extra field. `Qwen3VLMoeForConditionalGeneration.forward` fills it
+    # with the load-balancing loss only when `output_router_logits` is passed as a truthy kwarg; otherwise
+    # it stays `None`.
+    aux_loss: Optional[torch.FloatTensor] = None
+
+
+class Qwen3VLMoeModel(Qwen3VLModel):
+    # Adds nothing of its own: behavior is inherited unchanged from `Qwen3VLModel`; only the class name differs.
+    pass
+
+
+class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
+    # Overrides only `forward`: on top of the language-modeling loss it can compute the router
+    # load-balancing loss and add it to the total loss.
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+
+        Example:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration
+
+        >>> model = Qwen3VLMoeForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct", dtype="auto", device_map="auto")
+        >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-30B-A3B-Instruct")
+
+        >>> messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                    },
+                    {"type": "text", "text": "Describe this image in short."},
+                ],
+            }
+        ]
+
+        >>> # Preparation for inference
+        >>> inputs = processor.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_dict=True,
+            return_tensors="pt"
+        )
+        >>> inputs = inputs.to(model.device)
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=128)
+        >>> generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        >>> processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "A woman in a plaid shirt sits on a sandy beach at sunset, smiling as she gives a high-five to a yellow Labrador Retriever wearing a harness. The ocean waves roll in the background."
+        ```"""
+
+        # Run the wrapped multimodal model; extra kwargs are forwarded to it untouched.
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        # The first element of the model output is used as the hidden states fed to the LM head.
+        hidden_states = outputs[0]
+
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        # An int keeps the last `logits_to_keep` positions; the default 0 yields `slice(0, None)`, i.e. every
+        # position. A tensor is used directly as the index along the sequence dimension.
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+
+        # The auxiliary loss is opt-in per call: only the `output_router_logits` kwarg is consulted here,
+        # not the config.
+        aux_loss = None
+        if kwargs.get("output_router_logits", False):
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits,
+                self.config.text_config.num_experts,
+                self.config.text_config.num_experts_per_tok,
+                attention_mask,
+            )
+            # Without labels there is no main loss to add to; `aux_loss` is then only returned on its own.
+            if labels is not None:
+                loss += self.config.text_config.router_aux_loss_coef * aux_loss.to(
+                    loss.device
+                )  # make sure to reside in the same device
+
+        return Qwen3VLMoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            rope_deltas=outputs.rope_deltas,
+        )
+
+
+# Names exported by `from <module> import *`.
+# NOTE(review): `Qwen3VLMoeVisionConfig` and the internal building blocks defined above are not listed --
+# confirm that leaving them out of the public surface is intended.
+__all__ = [
+    "Qwen3VLMoeConfig",
+    "Qwen3VLMoeTextConfig",
+    "Qwen3VLMoeVisionModel",
+    "Qwen3VLMoeForConditionalGeneration",
+    "Qwen3VLMoeModel",
+    "Qwen3VLMoePreTrainedModel",
+    "Qwen3VLMoeTextModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd0ed7feaa3806121b285691d022704634debce2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/configuration_reformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/configuration_reformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca0fffb7461d8c7d244adcd9270c0455cec52de1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/configuration_reformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f2e1233c4774c95493b660f839c0b50293432e2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fb9bc2235392dfe63137cdd35344289a81e4b89
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/reformer/__pycache__/tokenization_reformer_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1713e724ff58b2b0ad8a3815706cea07aaa17dd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/configuration_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/configuration_regnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..371ebcc02ee7b79c65bc12dc8046e5c35e7b1351
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/configuration_regnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_flax_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_flax_regnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..915a2519b1a18d93d0b9f3aef570e3d7c640187e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_flax_regnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_regnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d388f68edf43587e6a28bf24036ccb82343ee1ff
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_regnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_tf_regnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_tf_regnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13fe8b4cf544222c150caec042c8af3ea56a785d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/regnet/__pycache__/modeling_tf_regnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a288723a1c31abf17b4fbcb64c3a2f11cbebfa4b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7ea64fd2977c97d91568e672546fbde06011801
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6565ef8bcf4fbae45c4d06bd7fa211b603d9d6d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c807b71634985914c1ea740037b13510a210b9d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ba9be42196bcac1c0fef10428fe99d610fa9ab3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20cde5e6535ef7021d0056ecf5164ac715863bd8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc0b9abd4763a724917b9430c6c16cfd0d73b5cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/configuration_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/configuration_resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c6849c3e0576ac5a7d475c504e9114b56c917e2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/configuration_resnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_flax_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_flax_resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b303fb5e9692991cd79ecd3f7f8ff331c1a89f3c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_flax_resnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73a2f1e4bd6dbee855f897c49d447632b5415977
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_resnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_tf_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_tf_resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..636fb1aebfc3795915444c1c6ce516789965f0c8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/resnet/__pycache__/modeling_tf_resnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ad2eff844ebc864fb3feb7389a32c689042a52c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/configuration_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/configuration_roberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..581483bd4e47c4cdb064244879a9f82c22b12bfa
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/configuration_roberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_flax_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_flax_roberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86b2572e8a9e15f2c3e80aee3e3cd285bf4538d2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_flax_roberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_roberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..956ea996744a4d6dfa56f5c85d73d40caebb33e6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_roberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_tf_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_tf_roberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc32caa61d8404598e17fff31c4838627f547d31
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/modeling_tf_roberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ca8ce490f2f45275fa21c6214cbe584f23d1d5c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f188b068e1ee5696c234f0d7609bbb7e2442334d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta/__pycache__/tokenization_roberta_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..208878343d24b3cec254a918f59d69841d1110c7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_roberta_prelayernorm import *
+    from .modeling_flax_roberta_prelayernorm import *
+    from .modeling_roberta_prelayernorm import *
+    from .modeling_tf_roberta_prelayernorm import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..72bc808c450d692903bb8826624afb0efb0199d6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RoBERTa-PreLayerNorm configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.roberta.configuration_roberta.RobertaConfig with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,RoBERTa->RoBERTa-PreLayerNorm,Roberta->RobertaPreLayerNorm,roberta->roberta-prelayernorm
+class RobertaPreLayerNormConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RobertaPreLayerNormModel`] or a [`TFRobertaPreLayerNormModel`]. It is
+    used to instantiate a RoBERTa-PreLayerNorm model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa-PreLayerNorm
+    [andreasmadsen/efficient_mlm_m0.40](https://huggingface.co/andreasmadsen/efficient_mlm_m0.40) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the RoBERTa-PreLayerNorm model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`RobertaPreLayerNormModel`] or [`TFRobertaPreLayerNormModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`RobertaPreLayerNormModel`] or [`TFRobertaPreLayerNormModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import RobertaPreLayerNormConfig, RobertaPreLayerNormModel
+
+    >>> # Initializing a RoBERTa-PreLayerNorm configuration
+    >>> configuration = RobertaPreLayerNormConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = RobertaPreLayerNormModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "roberta-prelayernorm"
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+# Copied from transformers.models.roberta.configuration_roberta.RobertaOnnxConfig with Roberta->RobertaPreLayerNorm
+class RobertaPreLayerNormOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["RobertaPreLayerNormConfig", "RobertaPreLayerNormOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f65dc07bb165ce1216831243405de26612d26232
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_flax_roberta_prelayernorm.py
@@ -0,0 +1,1527 @@
+# coding=utf-8
+# Copyright 2022 The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flax RoBERTa-PreLayerNorm model."""
+
+from typing import Callable, Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxBaseModelOutputWithPooling,
+    FlaxBaseModelOutputWithPoolingAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxMaskedLMOutput,
+    FlaxMultipleChoiceModelOutput,
+    FlaxQuestionAnsweringModelOutput,
+    FlaxSequenceClassifierOutput,
+    FlaxTokenClassifierOutput,
+)
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring, overwrite_call_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "andreasmadsen/efficient_mlm_m0.40"
+_CONFIG_FOR_DOC = "RobertaPreLayerNormConfig"
+
+remat = nn_partitioning.remat
+
+
+# Copied from transformers.models.roberta.modeling_flax_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        input_ids: jnp.ndarray
+        padding_idx: int
+
+    Returns: jnp.ndarray
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = (input_ids != padding_idx).astype("i4")
+
+    if mask.ndim > 2:
+        mask = mask.reshape((-1, mask.shape[-1]))
+        incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
+        incremental_indices = incremental_indices.reshape(input_ids.shape)
+    else:
+        incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
+
+    return incremental_indices.astype("i4") + padding_idx
+
+
+ROBERTA_PRELAYERNORM_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading, saving and converting weights from PyTorch models).
+
+    This model is also a
+    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
+    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
+    behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`RobertaPreLayerNormConfig`]): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        head_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings with Bert->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.word_embeddings = nn.Embed(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.position_embeddings = nn.Embed(
+            self.config.max_position_embeddings,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.token_type_embeddings = nn.Embed(
+            self.config.type_vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
+        # Embed
+        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
+        position_embeds = self.position_embeddings(position_ids.astype("i4"))
+        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
+
+        # Sum all embeddings
+        hidden_states = inputs_embeds + token_type_embeddings + position_embeds
+
+        # Layer Norm
+        hidden_states = self.LayerNorm(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormSelfAttention(nn.Module):
+    config: RobertaPreLayerNormConfig
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):  # builds the query/key/value projections and, when `causal`, the causal mask
+        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
+        if self.config.hidden_size % self.config.num_attention_heads != 0:
+            raise ValueError(  # f-strings so the offending values are actually interpolated into the message
+                f"`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
+                f"                   : {self.config.num_attention_heads}"
+            )
+
+        self.query = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.key = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.value = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        if self.causal:  # mask is sized by max_position_embeddings and sliced per call in __call__
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
+
+    @nn.compact
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+
+        # get query proj
+        query_states = self.query(hidden_states)
+        # get key, value proj
+        if is_cross_attention:
+            # cross_attentions
+            key_states = self.key(key_value_states)
+            value_states = self.value(key_value_states)
+        else:
+            # self_attention
+            key_states = self.key(hidden_states)
+            value_states = self.value(hidden_states)
+
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+
+        # handle cache prepare causal attention mask
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_probs_dropout_prob,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+class FlaxRobertaPreLayerNormSelfOutput(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = hidden_states + input_tensor
+        return hidden_states
+
+
+class FlaxRobertaPreLayerNormAttention(nn.Module):
+    config: RobertaPreLayerNormConfig
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.self = FlaxRobertaPreLayerNormSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
+        self.output = FlaxRobertaPreLayerNormSelfOutput(self.config, dtype=self.dtype)
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states=None,
+        init_cache=False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        hidden_states_pre_layer_norm = self.LayerNorm(hidden_states)
+        # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
+        # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
+        # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
+        attn_outputs = self.self(
+            hidden_states_pre_layer_norm,
+            attention_mask,
+            layer_head_mask=layer_head_mask,
+            key_value_states=key_value_states,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+        attn_output = attn_outputs[0]
+        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_outputs[1],)
+
+        return outputs
+
+
+class FlaxRobertaPreLayerNormIntermediate(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dense = nn.Dense(
+            self.config.intermediate_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.activation = ACT2FN[self.config.hidden_act]
+
+    def __call__(self, hidden_states):
+        hidden_states = self.LayerNorm(hidden_states)
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class FlaxRobertaPreLayerNormOutput(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def __call__(self, hidden_states, attention_output, deterministic: bool = True):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = hidden_states + attention_output
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormLayer(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.attention = FlaxRobertaPreLayerNormAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
+        self.intermediate = FlaxRobertaPreLayerNormIntermediate(self.config, dtype=self.dtype)
+        self.output = FlaxRobertaPreLayerNormOutput(self.config, dtype=self.dtype)
+        if self.config.add_cross_attention:
+            self.crossattention = FlaxRobertaPreLayerNormAttention(self.config, causal=False, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+    ):
+        # Self Attention
+        attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            layer_head_mask=layer_head_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+        attention_output = attention_outputs[0]
+
+        # Cross-Attention Block
+        if encoder_hidden_states is not None:
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=layer_head_mask,
+                key_value_states=encoder_hidden_states,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+
+        hidden_states = self.intermediate(attention_output)
+        hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attention_outputs[1],)
+            if encoder_hidden_states is not None:
+                outputs += (cross_attention_outputs[1],)
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormLayerCollection(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        if self.gradient_checkpointing:
+            FlaxRobertaPreLayerNormCheckpointLayer = remat(FlaxRobertaPreLayerNormLayer, static_argnums=(5, 6, 7))
+            self.layers = [
+                FlaxRobertaPreLayerNormCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+        else:
+            self.layers = [
+                FlaxRobertaPreLayerNormLayer(self.config, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # Check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.shape[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for                  "
+                    f"       {head_mask.shape[0]}."
+                )
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask,
+                head_mask[i] if head_mask is not None else None,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                init_cache,
+                deterministic,
+                output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormEncoder(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.layer = FlaxRobertaPreLayerNormLayerCollection(
+            self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return self.layer(
+            hidden_states,
+            attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormPooler(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+
+    def __call__(self, hidden_states):
+        cls_hidden_state = hidden_states[:, 0]
+        cls_hidden_state = self.dense(cls_hidden_state)
+        return nn.tanh(cls_hidden_state)
+
+
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaLMHead with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormLMHead(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.decoder = nn.Dense(
+            self.config.vocab_size,
+            dtype=self.dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
+
+    def __call__(self, hidden_states, shared_embedding=None):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = ACT2FN["gelu"](hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+
+        if shared_embedding is not None:
+            hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+        else:
+            hidden_states = self.decoder(hidden_states)
+
+        bias = jnp.asarray(self.bias, self.dtype)
+        hidden_states += bias
+        return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaClassificationHead with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormClassificationHead(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
+        self.out_proj = nn.Dense(
+            self.config.num_labels,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+    def __call__(self, hidden_states, deterministic=True):
+        hidden_states = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.dense(hidden_states)
+        hidden_states = nn.tanh(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.out_proj(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaPreTrainedModel with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class FlaxRobertaPreLayerNormPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+
+    Concrete subclasses set `module_class` to the Flax module that should be instantiated and wrapped.
+    """
+
+    config_class = RobertaPreLayerNormConfig
+    base_model_prefix = "roberta_prelayernorm"
+
+    # Flax module class instantiated in `__init__`; `None` here and overridden by the subclasses in this file.
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: RobertaPreLayerNormConfig,
+        input_shape: tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        gradient_checkpointing: bool = False,
+        **kwargs,
+    ):
+        # Build the wrapped module first; any extra keyword arguments are forwarded to its constructor.
+        module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing
+    def enable_gradient_checkpointing(self):
+        """Replace the wrapped module with a new instance built with `gradient_checkpointing=True`."""
+        # Only `config` and `dtype` are carried over; extra kwargs given to `__init__` are not re-applied here.
+        self._module = self.module_class(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=True,
+        )
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        """
+        Initialise the module parameters by running `self.module.init` on dummy inputs of shape `input_shape`.
+
+        Args:
+            rng: key that is split into one key for parameter initialisation and one for dropout.
+            input_shape: shape of the dummy `input_ids` used to trace the module.
+            params: optional existing parameters. When given, only the entries listed in `self._missing_keys` are
+                filled in from the freshly initialised parameters.
+
+        Returns:
+            The freshly initialised parameters, or `params` completed with the missing entries (frozen).
+        """
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        token_type_ids = jnp.ones_like(input_ids)
+        position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
+        attention_mask = jnp.ones_like(input_ids)
+        head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        if self.config.add_cross_attention:
+            # Cross-attention needs dummy encoder inputs as well so that its parameters get created.
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                token_type_ids,
+                position_ids,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(
+                rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
+            )
+
+        random_params = module_init_outputs["params"]
+
+        if params is not None:
+            # Work on flattened dicts so that missing entries can be copied key by key.
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            # All missing keys have been filled in; reset the bookkeeping set.
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Initialise the module with `init_cache=True` on dummy inputs and return its `cache` collection (unfrozen).
+
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        attention_mask = jnp.ones_like(input_ids, dtype="i4")
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        # NOTE(review): `position_ids` is passed as the third positional input. For modules in this file whose third
+        # parameter is `token_type_ids` (e.g. `FlaxRobertaPreLayerNormModule`) it binds to that slot instead --
+        # confirm this is intended for the `module_class` this method is used with.
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        params: Optional[dict] = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        past_key_values: Optional[dict] = None,
+    ):
+        # Output options left as `None` fall back to the values stored in the config.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # init input tensors if not passed
+        if token_type_ids is None:
+            token_type_ids = jnp.zeros_like(input_ids)
+
+        if position_ids is None:
+            position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+
+        if head_mask is None:
+            head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        # Explicitly passed `params` take precedence over the parameters stored on the model.
+        inputs = {"params": params or self.params}
+
+        if self.config.add_cross_attention:
+            # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed
+            # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be
+            # changed by FlaxRobertaPreLayerNormAttention module
+            if past_key_values:
+                inputs["cache"] = past_key_values
+                mutable = ["cache"]
+            else:
+                mutable = False
+
+            # Integer inputs are converted to int32 ("i4") arrays before being handed to the module.
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+                mutable=mutable,
+            )
+
+            # add updated cache to model output
+            # NOTE(review): the branch above tests truthiness (`if past_key_values:`) while the tests below use
+            # `is not None`; an empty dict would reach the unpacking with `mutable=False` -- confirm that case
+            # cannot occur.
+            if past_key_values is not None and return_dict:
+                outputs, past_key_values = outputs
+                outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+                return outputs
+            elif past_key_values is not None and not return_dict:
+                outputs, past_key_values = outputs
+                # Insert the updated cache right after the first output element.
+                outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        else:
+            # Without cross-attention no encoder inputs and no cache are passed to the module.
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+            )
+
+        return outputs
+
+
+class FlaxRobertaPreLayerNormModule(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    add_pooling_layer: bool = True
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.embeddings = FlaxRobertaPreLayerNormEmbeddings(self.config, dtype=self.dtype)
+        self.encoder = FlaxRobertaPreLayerNormEncoder(
+            self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.pooler = FlaxRobertaPreLayerNormPooler(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        head_mask: Optional[jnp.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # make sure `token_type_ids` is correctly initialized when not passed
+        if token_type_ids is None:
+            token_type_ids = jnp.zeros_like(input_ids)
+
+        # make sure `position_ids` is correctly initialized when not passed
+        if position_ids is None:
+            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        hidden_states = self.embeddings(
+            input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
+        )
+        outputs = self.encoder(
+            hidden_states,
+            attention_mask,
+            head_mask=head_mask,
+            deterministic=deterministic,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.LayerNorm(hidden_states)
+        pooled = self.pooler(hidden_states) if self.add_pooling_layer else None
+
+        if not return_dict:
+            # if pooled is None, don't return it
+            if pooled is None:
+                return (hidden_states,) + outputs[1:]
+            return (hidden_states, pooled) + outputs[1:]
+
+        return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            pooler_output=pooled,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The bare RoBERTa-PreLayerNorm Model transformer outputting raw hidden-states without any specific head on top.",
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaModel with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormModel(FlaxRobertaPreLayerNormPreTrainedModel):
+    # Flax module wrapped by this model class; all behaviour comes from the pretrained-model base class.
+    module_class = FlaxRobertaPreLayerNormModule
+
+
+# Presumably attaches a usage example (checkpoint, output type, config) to the model's `__call__` docstring --
+# see `append_call_sample_docstring` for the exact effect.
+append_call_sample_docstring(
+    FlaxRobertaPreLayerNormModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutputWithPooling,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForMaskedLMModule with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class FlaxRobertaPreLayerNormForMaskedLMModule(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
+            config=self.config,
+            add_pooling_layer=False,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.lm_head = FlaxRobertaPreLayerNormLMHead(config=self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.roberta_prelayernorm.variables["params"]["embeddings"]["word_embeddings"][
+                "embedding"
+            ]
+        else:
+            shared_embedding = None
+
+        # Compute the prediction scores
+        logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]
+
+        return FlaxMaskedLMOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING
+)
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForMaskedLM with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormForMaskedLM(FlaxRobertaPreLayerNormPreTrainedModel):
+    module_class = FlaxRobertaPreLayerNormForMaskedLMModule
+
+
+append_call_sample_docstring(
+    FlaxRobertaPreLayerNormForMaskedLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutputWithPooling,
+    _CONFIG_FOR_DOC,
+    mask="<mask>",
+)
+
+
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForSequenceClassificationModule with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class FlaxRobertaPreLayerNormForSequenceClassificationModule(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
+            config=self.config,
+            dtype=self.dtype,
+            add_pooling_layer=False,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.classifier = FlaxRobertaPreLayerNormClassificationHead(config=self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output, deterministic=deterministic)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]
+
+        return FlaxSequenceClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    RobertaPreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top
+    of the pooled output) e.g. for GLUE tasks.
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForSequenceClassification with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormForSequenceClassification(FlaxRobertaPreLayerNormPreTrainedModel):
+    # Flax module wrapped by this model class.
+    module_class = FlaxRobertaPreLayerNormForSequenceClassificationModule
+
+
+# Presumably attaches a usage example (checkpoint, output type, config) to the model's `__call__` docstring --
+# see `append_call_sample_docstring` for the exact effect.
+append_call_sample_docstring(
+    FlaxRobertaPreLayerNormForSequenceClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxSequenceClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->RobertaPreLayerNorm, with self.bert->self.roberta_prelayernorm
+class FlaxRobertaPreLayerNormForMultipleChoiceModule(nn.Module):
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        self.classifier = nn.Dense(1, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        num_choices = input_ids.shape[1]
+        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
+        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
+        token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
+        position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
+
+        # Model
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
+        logits = self.classifier(pooled_output)
+
+        reshaped_logits = logits.reshape(-1, num_choices)
+
+        if not return_dict:
+            return (reshaped_logits,) + outputs[2:]
+
+        return FlaxMultipleChoiceModelOutput(
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    RobertaPreLayerNorm Model with a multiple choice classification head on top (a linear layer on top of the pooled
+    output and a softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForMultipleChoice with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormForMultipleChoice(FlaxRobertaPreLayerNormPreTrainedModel):
+    # Flax module wrapped by this model class.
+    module_class = FlaxRobertaPreLayerNormForMultipleChoiceModule
+
+
+# The inputs docstring is re-registered with a three-dimensional input shape that includes `num_choices`.
+overwrite_call_docstring(
+    FlaxRobertaPreLayerNormForMultipleChoice,
+    ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"),
+)
+# Presumably attaches a usage example (checkpoint, output type, config) to the model's `__call__` docstring --
+# see `append_call_sample_docstring` for the exact effect.
+append_call_sample_docstring(
+    FlaxRobertaPreLayerNormForMultipleChoice,
+    _CHECKPOINT_FOR_DOC,
+    FlaxMultipleChoiceModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->RobertaPreLayerNorm, with self.bert->self.roberta_prelayernorm
+class FlaxRobertaPreLayerNormForTokenClassificationModule(nn.Module):  # FlaxRobertaPreLayerNormModule + dropout + Dense head applied to outputs[0]
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):  # instantiate the backbone, the dropout layer and the classifier
+        self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
+            config=self.config,
+            dtype=self.dtype,
+            add_pooling_layer=False,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        classifier_dropout = (  # prefer config.classifier_dropout; fall back to hidden_dropout_prob when it is None
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
+        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)  # one logit per label
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)  # dropout follows the same deterministic flag as the backbone
+        logits = self.classifier(hidden_states)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]  # tuple form: logits take the place of outputs[0]
+
+        return FlaxTokenClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    RobertaPreLayerNorm Model with a token classification head on top (a linear layer on top of the hidden-states
+    output) e.g. for Named-Entity-Recognition (NER) tasks.
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForTokenClassification with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormForTokenClassification(FlaxRobertaPreLayerNormPreTrainedModel):  # pretrained-model wrapper; only selects the module below
+    module_class = FlaxRobertaPreLayerNormForTokenClassificationModule
+
+
+append_call_sample_docstring(
+    FlaxRobertaPreLayerNormForTokenClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxTokenClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForQuestionAnsweringModule with Bert->RobertaPreLayerNorm, with self.bert->self.roberta_prelayernorm
+class FlaxRobertaPreLayerNormForQuestionAnsweringModule(nn.Module):  # FlaxRobertaPreLayerNormModule + Dense head split into start/end logits
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):  # instantiate the backbone and the span-logit projection
+        self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
+            config=self.config,
+            dtype=self.dtype,
+            add_pooling_layer=False,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)  # __call__ unpacks exactly two outputs (start, end)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+
+        logits = self.qa_outputs(hidden_states)
+        start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)  # num_labels equal sections; the two-name unpack requires num_labels == 2
+        start_logits = start_logits.squeeze(-1)  # drop the trailing axis left by the split
+        end_logits = end_logits.squeeze(-1)
+
+        if not return_dict:
+            return (start_logits, end_logits) + outputs[1:]  # tuple form: both logits replace outputs[0]
+
+        return FlaxQuestionAnsweringModelOutput(
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    RobertaPreLayerNorm Model with a span classification head on top for extractive question-answering tasks like SQuAD
+    (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForQuestionAnswering with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormForQuestionAnswering(FlaxRobertaPreLayerNormPreTrainedModel):  # pretrained-model wrapper; only selects the module below
+    module_class = FlaxRobertaPreLayerNormForQuestionAnsweringModule
+
+
+append_call_sample_docstring(
+    FlaxRobertaPreLayerNormForQuestionAnswering,
+    _CHECKPOINT_FOR_DOC,
+    FlaxQuestionAnsweringModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLMModule with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class FlaxRobertaPreLayerNormForCausalLMModule(nn.Module):  # FlaxRobertaPreLayerNormModule + LM head, optionally sharing the word-embedding matrix
+    config: RobertaPreLayerNormConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):  # instantiate the backbone and the language-modeling head
+        self.roberta_prelayernorm = FlaxRobertaPreLayerNormModule(
+            config=self.config,
+            add_pooling_layer=False,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.lm_head = FlaxRobertaPreLayerNormLMHead(config=self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        token_type_ids: Optional[jnp.ndarray] = None,
+        head_mask: Optional[jnp.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask,
+            token_type_ids,  # note: the backbone is called with token_type_ids before position_ids, unlike this signature
+            position_ids,
+            head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:  # hand the backbone's word-embedding table to the LM head
+            shared_embedding = self.roberta_prelayernorm.variables["params"]["embeddings"]["word_embeddings"][
+                "embedding"
+            ]
+        else:
+            shared_embedding = None  # the LM head receives no shared embedding
+
+        # Compute the prediction scores
+        logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]  # tuple form: logits take the place of outputs[0]
+
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    RobertaPreLayerNorm Model with a language modeling head on top (a linear layer on top of the hidden-states output)
+    e.g. for autoregressive tasks.
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_flax_roberta.FlaxRobertaForCausalLM with Roberta->RobertaPreLayerNorm
+class FlaxRobertaPreLayerNormForCausalLM(FlaxRobertaPreLayerNormPreTrainedModel):  # pretrained-model wrapper plus generation input helpers
+    module_class = FlaxRobertaPreLayerNormForCausalLMModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):  # returns the initial cache, mask and position ids as a dict
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyway.
+        # Thus, we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1  # zero-based running count of attended tokens
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))  # write the caller's mask at offset (0, 0)
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))  # 0..seq_length-1 for every row
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):  # mutates and returns model_kwargs for the next step
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1  # keep only the last position and advance it by one
+        return model_kwargs
+
+
+append_call_sample_docstring(
+    FlaxRobertaPreLayerNormForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutputWithCrossAttentions,
+    _CONFIG_FOR_DOC,
+)
+
+
+__all__ = [
+    "FlaxRobertaPreLayerNormForCausalLM",
+    "FlaxRobertaPreLayerNormForMaskedLM",
+    "FlaxRobertaPreLayerNormForMultipleChoice",
+    "FlaxRobertaPreLayerNormForQuestionAnswering",
+    "FlaxRobertaPreLayerNormForSequenceClassification",
+    "FlaxRobertaPreLayerNormForTokenClassification",
+    "FlaxRobertaPreLayerNormModel",
+    "FlaxRobertaPreLayerNormPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..81481574b01eacae6f6f5eb0fa25c64ac287c9c1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py
@@ -0,0 +1,1442 @@
+# coding=utf-8
+# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch RoBERTa-PreLayerNorm model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN, gelu
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->RobertaPreLayerNorm
+class RobertaPreLayerNormEmbeddings(nn.Module):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+
+    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)  # reassigned after "End copy" with padding_idx set
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+        # End copy
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(  # replaces the position_embeddings assigned earlier in __init__
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]  # drop the trailing embedding dimension
+
+        seq_length = input_shape[1]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]  # slice the all-zero buffer to the current length
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":  # position embeddings are added only in "absolute" mode
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(  # sequential ids starting at padding_idx + 1
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)  # same ids for every row of the batch
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->RobertaPreLayerNorm
+class RobertaPreLayerNormSelfAttention(nn.Module):  # multi-head attention with optional KV cache and relative position scores
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(  # explicit argument wins over the config value
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx  # index used to address this layer's entry in the cache
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = self.query(hidden_states)
+        query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+            1, 2
+        )  # now (batch, heads, seq, head_size)
+
+        is_updated = False
+        is_cross_attention = encoder_hidden_states is not None  # encoder states present means cross-attention
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states  # source of keys and values
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = self.key(current_states)
+            key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose(
+                1, 2
+            )
+            value_layer = self.value(current_states)
+            value_layer = value_layer.view(
+                batch_size, -1, self.num_attention_heads, self.attention_head_size
+            ).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None  # cross-attention passes no cache_position to update()
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if past_key_values is not None:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )  # with a cache, a single query position: the last key index
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)  # shift distances into the embedding's index range
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)  # scale by sqrt(head size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in the RobertaPreLayerNormModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # move the head axis back next to head_size
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)  # merge the last two axes into all_head_size
+
+        return context_layer, attention_probs  # attention_probs is returned regardless of output_attentions
+
+
+class RobertaPreLayerNormSelfOutput(nn.Module):  # dense projection, dropout and residual add; holds no LayerNorm of its own
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states + input_tensor  # residual connection with the tensor supplied by the caller
+        return hidden_states
+
+
+class RobertaPreLayerNormAttention(nn.Module):  # LayerNorm -> self-attention -> output projection with residual
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        self.self = RobertaPreLayerNormSelfAttention(
+            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
+        )
+        self.output = RobertaPreLayerNormSelfOutput(config)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pruned_heads = set()  # heads already removed by prune_heads
+
+    # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
+    def prune_heads(self, heads):
+        if len(heads) == 0:  # nothing to prune
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        hidden_states_pre_layer_norm = self.LayerNorm(hidden_states)  # normalize before attention
+        self_outputs = self.self(
+            hidden_states_pre_layer_norm,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            past_key_values,
+            output_attentions,
+            cache_position,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)  # the residual uses the un-normalized input
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class RobertaPreLayerNormIntermediate(nn.Module):  # LayerNorm -> dense (hidden_size to intermediate_size) -> activation
+    def __init__(self, config):
+        super().__init__()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # a string is looked up in ACT2FN
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act  # otherwise the config value is used directly as the callable
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.LayerNorm(hidden_states)  # LayerNorm is applied before the dense projection
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class RobertaPreLayerNormOutput(nn.Module):  # dense (intermediate_size to hidden_size), dropout and residual add
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states + input_tensor  # residual add; this module applies no LayerNorm
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->RobertaPreLayerNorm
+class RobertaPreLayerNormLayer(GradientCheckpointingLayer):  # self-attention, optional cross-attention, then chunked feed-forward
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1  # dimension handed to apply_chunking_to_forward in forward()
+        self.attention = RobertaPreLayerNormAttention(config, layer_idx=layer_idx)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = RobertaPreLayerNormAttention(  # cross-attention always uses "absolute" position embeddings
+                config, position_embedding_type="absolute", layer_idx=layer_idx
+            )
+        self.intermediate = RobertaPreLayerNormIntermediate(config)
+        self.output = RobertaPreLayerNormOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        if self.is_decoder and encoder_hidden_states is not None:  # cross-attention runs only for decoders given encoder states
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs  # layer output first, then any attention tensors
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):  # callback given to apply_chunking_to_forward
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)  # attention_output is the residual input
+        return layer_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->RobertaPreLayerNorm
+class RobertaPreLayerNormEncoder(nn.Module):  # stack of config.num_hidden_layers RobertaPreLayerNormLayer modules
+    def __init__(self, config, layer_idx=None):  # NOTE(review): the layer_idx parameter is never read in this body
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList(
+            [RobertaPreLayerNormLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        all_hidden_states = () if output_hidden_states else None  # accumulators stay None when not requested
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        # Caching is switched off (with a one-time warning) when gradient checkpointing is active during training.
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        # Decoder with caching requested but no cache supplied: build an EncoderDecoderCache from two new DynamicCache.
+        if use_cache and self.config.is_decoder and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        # A tuple cache is still accepted: warn once, then convert it with EncoderDecoderCache.from_legacy_cache.
+        if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+        # Run the layers in order; element 0 of each layer's output becomes the next layer's hidden_states.
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)  # records the input given to layer i
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None  # head_mask is indexed per layer
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)  # element 1 -> self-attention tuple
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)  # element 2 -> cross-attention tuple
+        # After the loop the last layer's output is appended as well (layer inputs + final output).
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        # Tuple form drops every None entry, so element positions depend on which outputs were requested.
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class RobertaPreLayerNormPooler(nn.Module):  # Linear followed by Tanh, applied to the hidden state at index 0 of dim 1
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # square projection: hidden_size -> hidden_size
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]  # assumes dim 1 is the sequence axis — TODO confirm
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+@auto_docstring
+class RobertaPreLayerNormPreTrainedModel(PreTrainedModel):  # shared base: config type, backbone prefix, weight init
+    config: RobertaPreLayerNormConfig
+    base_model_prefix = "roberta_prelayernorm"  # same name the task heads in this file use for their backbone attribute
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["RobertaPreLayerNormEmbeddings", "RobertaPreLayerNormSelfAttention"]
+
+    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with BertLMPredictionHead->RobertaPreLayerNormLMHead
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()  # the padding row is zeroed after the normal init
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()  # LayerNorm starts with zero bias and unit weight
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, RobertaPreLayerNormLMHead):
+            module.bias.data.zero_()  # the LM head's own vocab_size-long bias parameter
+
+
+@auto_docstring(
+    custom_intro="""
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
+    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+
+    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
+    """
+)
+class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel):  # embeddings -> encoder -> LayerNorm -> optional pooler
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = RobertaPreLayerNormEmbeddings(config)
+        self.encoder = RobertaPreLayerNormEncoder(config)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)  # applied to the encoder output in forward()
+
+        self.pooler = RobertaPreLayerNormPooler(config) if add_pooling_layer else None  # None disables pooling in forward()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # Caching only applies when config.is_decoder is set; otherwise use_cache is forced to False.
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+        # Exactly one of input_ids / inputs_embeds must be supplied; whichever is given determines input_shape.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]  # drop the trailing (embedding) dimension
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape  # input_shape must unpack to exactly two dimensions
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        # Length already held in the cache: from a legacy tuple's tensor shape, or from Cache.get_seq_length().
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = (
+                past_key_values[0][0].shape[-2]
+                if not isinstance(past_key_values, Cache)
+                else past_key_values.get_seq_length()
+            )
+        # Default attention mask: all ones, covering the cached length plus the new seq_length.
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+        # Default token types: a slice of the embeddings' token_type_ids attribute when it exists, otherwise zeros.
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None  # no cross-attention mask outside decoder mode / without encoder states
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(  # NOTE(review): cache_position is not passed, so the encoder uses its default (None)
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.LayerNorm(sequence_output)  # model-level LayerNorm on the encoder's first output
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+        # Tuple form always carries the pooled slot at index 1, holding None when the model has no pooler.
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    RoBERTa-PreLayerNorm Model with a `language modeling` head on top for CLM fine-tuning.
+    """
+)
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with FacebookAI/roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer
+class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning(  # NOTE(review): the message names RobertaPreLayerNormLMHeadModel, which is not this class's name
+                "If you want to use `RobertaPreLayerNormLMHeadModel` as a standalone, add `is_decoder=True.`"
+            )
+
+        self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)  # backbone built without the pooler
+        self.lm_head = RobertaPreLayerNormLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, RobertaPreLayerNormForCausalLM, AutoConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
+        >>> config = AutoConfig.from_pretrained("andreasmadsen/efficient_mlm_m0.40")
+        >>> config.is_decoder = True
+        >>> model = RobertaPreLayerNormForCausalLM.from_pretrained("andreasmadsen/efficient_mlm_m0.40", config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False  # caching is turned off whenever labels are supplied
+
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
+            lm_loss = self.loss_function(  # extra **kwargs given to forward() are passed through to the loss function
+                prediction_scores,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+        # Tuple form skips outputs[1] (the backbone's pooled slot); the loss is prepended only when it was computed.
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    RoBERTa-PreLayerNorm Model with a `language modeling` head on top.
+    """
+)
+class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel):
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
+
+    # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:  # only warns; construction continues with the decoder config
+            logger.warning(
+                "If you want to use `RobertaPreLayerNormForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)  # backbone built without the pooler
+        self.lm_head = RobertaPreLayerNormLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+
+    @auto_docstring
+    # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.forward with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(prediction_scores.device)
+            loss_fct = CrossEntropyLoss()  # scores are flattened to (-1, vocab_size) and labels to (-1) on the next line
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+        # Tuple form skips outputs[1] (the backbone's pooled slot); the loss is prepended only when it was computed.
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->RobertaPreLayerNorm
+class RobertaPreLayerNormLMHead(nn.Module):
+    """RobertaPreLayerNorm Head for masked language modeling."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)  # projects hidden_size -> vocab_size
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+        self.decoder.bias = self.bias  # decoder.bias and self.bias now refer to the same Parameter object
+
+    def forward(self, features, **kwargs):  # **kwargs is accepted but not used in this body
+        x = self.dense(features)
+        x = gelu(x)
+        x = self.layer_norm(x)
+
+        # project back to size of vocabulary with bias
+        x = self.decoder(x)
+
+        return x
+
+    def _tie_weights(self):
+        # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
+        # For accelerate compatibility and to not break backward compatibility
+        if self.decoder.bias.device.type == "meta":  # decoder bias on the meta device: re-point it at self.bias
+            self.decoder.bias = self.bias
+        else:
+            self.bias = self.decoder.bias  # otherwise self.bias is replaced by the decoder's current bias
+
+
+@auto_docstring(
+    custom_intro="""
+    RoBERTa-PreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top
+    of the pooled output) e.g. for GLUE tasks.
+    """
+)
+class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)  # backbone built without the pooler
+        self.classifier = RobertaPreLayerNormClassificationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.forward with roberta->roberta_prelayernorm
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)  # head gets the full sequence output, not a pooled vector (no pooler is built)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:  # inferred from num_labels / label dtype and written back to the config
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            # Loss per problem_type: MSELoss (regression), CrossEntropyLoss (single-label), BCEWithLogitsLoss (multi-label).
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        # Tuple form skips outputs[1] (the backbone's pooled slot); the loss is prepended only when it was computed.
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel):
+    # Multiple-choice head: every choice is run through the backbone as its own row, scored with a
+    # single-logit linear layer, and the scores are regrouped into one row of `num_choices` per example.
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.roberta_prelayernorm = RobertaPreLayerNormModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # One output unit: each (example, choice) row receives a single score.
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        # Use the config default when the caller does not choose a return format.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # The choice count is dimension 1 of whichever input was supplied.
+        # NOTE(review): when both `input_ids` and `inputs_embeds` are None this raises AttributeError.
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        # Collapse the leading dimensions so every choice becomes its own row for the backbone.
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        # `head_mask` is forwarded as given; only the per-token inputs are flattened.
+        outputs = self.roberta_prelayernorm(
+            flat_input_ids,
+            position_ids=flat_position_ids,
+            token_type_ids=flat_token_type_ids,
+            attention_mask=flat_attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Index 1 of the backbone output is used as the pooled representation.
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        # One logit per flattened row; regroup into rows of `num_choices` logits.
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(reshaped_logits.device)
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            # Tuple form: logits, then backbone outputs from index 2 onward; the loss is prepended when computed.
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel):
+    # Token-level classifier: dropout followed by a linear layer applied to the backbone's first output.
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        # The backbone is created without its pooling layer; only index 0 of its output is read in forward().
+        self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
+        # Prefer the dedicated classifier dropout rate; fall back to the hidden dropout rate when it is None.
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.forward with roberta->roberta_prelayernorm
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        # Use the config default when the caller does not choose a return format.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Index 0 of the backbone output is used as the per-token representation.
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            loss_fct = CrossEntropyLoss()
+            # Flatten so each token is one classification row with `num_labels` logits and one target.
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            # Tuple form: logits, then backbone outputs from index 2 onward; the loss is prepended when computed.
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->RobertaPreLayerNorm
+class RobertaPreLayerNormClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        # Use the dedicated classifier dropout rate unless it is unset (None).
+        if config.classifier_dropout is not None:
+            drop_rate = config.classifier_dropout
+        else:
+            drop_rate = config.hidden_dropout_prob
+        self.dropout = nn.Dropout(drop_rate)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        """
+        Map the first position of `features` to `config.num_labels` logits.
+
+        The pipeline is dropout -> dense -> tanh -> dropout -> out_proj. Extra keyword arguments are accepted and
+        ignored.
+        """
+        first_token = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        hidden = torch.tanh(self.dense(self.dropout(first_token)))
+        return self.out_proj(self.dropout(hidden))
+
+
+@auto_docstring
+class RobertaPreLayerNormForQuestionAnswering(RobertaPreLayerNormPreTrainedModel):
+    # Extractive QA head: a linear layer over the backbone's first output yields start and end logits per token.
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        # The backbone is created without its pooling layer; only index 0 of its output is read in forward().
+        self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False)
+        # NOTE(review): forward() unpacks the split logits into exactly two tensors, so this only works
+        # when config.num_labels == 2.
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.forward with roberta->roberta_prelayernorm
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        r"""
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value
+            >= 2. All the value in this tensor should be always < type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        """
+        # Use the config default when the caller does not choose a return format.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # Index 0 of the backbone output is used as the per-token representation.
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        # Split the last dimension into size-1 slices: the first is the start score, the second the end score.
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        # The loss is only computed when both target tensors are provided.
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU the targets can arrive with an extra trailing dimension; squeeze it away
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            # Targets are clamped to [0, ignored_index]; those equal to ignored_index are skipped by the loss.
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            # Final loss is the mean of the start and end losses.
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            # Tuple form: start/end logits, then backbone outputs from index 2 onward; the loss is prepended when computed.
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Derive position ids from token ids: each non-padding token gets a running position number starting at
+    `padding_idx + 1`, while padding tokens keep `padding_idx`. This is modified from fairseq's
+    `utils.make_positions`.
+
+    Args:
+        input_ids: torch.Tensor of token ids; the running count is taken along dimension 1.
+        padding_idx: token id that marks padding.
+        past_key_values_length: offset added to the running count of every non-padding token.
+
+    Returns: torch.Tensor (int64) with the same shape as `input_ids`.
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    non_padding = input_ids.ne(padding_idx).int()
+    running_count = torch.cumsum(non_padding, dim=1).type_as(non_padding)
+    # Multiplying by the mask zeroes padding slots so they end up at exactly `padding_idx` below.
+    offset_positions = (running_count + past_key_values_length) * non_padding
+    return offset_positions.long() + padding_idx
+
+
+__all__ = [
+    "RobertaPreLayerNormForCausalLM",
+    "RobertaPreLayerNormForMaskedLM",
+    "RobertaPreLayerNormForMultipleChoice",
+    "RobertaPreLayerNormForQuestionAnswering",
+    "RobertaPreLayerNormForSequenceClassification",
+    "RobertaPreLayerNormForTokenClassification",
+    "RobertaPreLayerNormModel",
+    "RobertaPreLayerNormPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a370f390269240fb02d55f493fbf68905f99977
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/modeling_tf_roberta_prelayernorm.py
@@ -0,0 +1,1807 @@
+# coding=utf-8
+# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 RoBERTa-PreLayerNorm model."""
+
+from __future__ import annotations
+
+import math
+import warnings
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFBaseModelOutputWithPoolingAndCrossAttentions,
+    TFCausalLMOutputWithCrossAttentions,
+    TFMaskedLMOutput,
+    TFMultipleChoiceModelOutput,
+    TFQuestionAnsweringModelOutput,
+    TFSequenceClassifierOutput,
+    TFTokenClassifierOutput,
+)
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFMaskedLanguageModelingLoss,
+    TFModelInputType,
+    TFMultipleChoiceLoss,
+    TFPreTrainedModel,
+    TFQuestionAnsweringLoss,
+    TFSequenceClassificationLoss,
+    TFTokenClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_roberta_prelayernorm import RobertaPreLayerNormConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "andreasmadsen/efficient_mlm_m0.40"
+_CONFIG_FOR_DOC = "RobertaPreLayerNormConfig"
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings with Roberta->RobertaPreLayerNorm
+class TFRobertaPreLayerNormEmbeddings(keras.layers.Layer):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        # Token id treated as padding when deriving position ids; non-padding positions start at padding_idx + 1.
+        self.padding_idx = 1
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.max_position_embeddings = config.max_position_embeddings
+        self.initializer_range = config.initializer_range
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def build(self, input_shape=None):
+        """
+        Create the word, token-type and position embedding tables and build the LayerNorm sublayer.
+
+        The `built` guard runs before any weight is created, so calling `build()` again is a no-op instead of
+        replacing the existing embedding variables with freshly initialized ones.
+        """
+        if self.built:
+            return
+
+        with tf.name_scope("word_embeddings"):
+            self.weight = self.add_weight(
+                name="weight",
+                shape=[self.config.vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("token_type_embeddings"):
+            self.token_type_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.config.type_vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+    def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
+        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
+
+        Args:
+            input_ids: tf.Tensor of token ids; the running count is taken along axis 1.
+            past_key_values_length: offset added to the position of every non-padding token.
+        Returns: tf.Tensor with the same dtype as `input_ids`.
+        """
+        mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype)
+        # Multiplying by the mask zeroes padding slots so they end up at exactly `padding_idx` below.
+        incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask
+
+        return incremental_indices + self.padding_idx
+
+    def call(
+        self,
+        input_ids=None,
+        position_ids=None,
+        token_type_ids=None,
+        inputs_embeds=None,
+        past_key_values_length=0,
+        training=False,
+    ):
+        """
+        Applies embedding based on inputs tensor.
+
+        At least one of `input_ids` and `inputs_embeds` must be given; when `input_ids` is given, the embeddings
+        are looked up from it and any passed `inputs_embeds` is overwritten. Missing `token_type_ids` default to
+        zeros; missing `position_ids` are derived from `input_ids` when available, otherwise they are a plain
+        range starting at `padding_idx + 1`.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+
+        if input_ids is not None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        # Every dimension except the trailing embedding dimension.
+        input_shape = shape_list(inputs_embeds)[:-1]
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = self.create_position_ids_from_input_ids(
+                    input_ids=input_ids, past_key_values_length=past_key_values_length
+                )
+            else:
+                # Without token ids padding cannot be detected, so every slot gets a sequential position.
+                position_ids = tf.expand_dims(
+                    tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0
+                )
+
+        position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids)
+        token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids)
+        final_embeddings = inputs_embeds + position_embeds + token_type_embeds
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->RobertaPreLayerNorm
+class TFRobertaPreLayerNormPooler(keras.layers.Layer):
+    """Pools a sequence by passing the first position's hidden state through a tanh-activated dense layer."""
+
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        dense_init = get_initializer(config.initializer_range)
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=dense_init,
+            activation="tanh",
+            name="dense",
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        # "Pooling" here is simply selecting the hidden state at index 0 of the second axis
+        # and projecting it.
+        return self.dense(inputs=hidden_states[:, 0])
+
+    def build(self, input_shape=None):
+        """Build the dense sublayer under its own name scope; later calls are no-ops."""
+        if not self.built:
+            self.built = True
+            dense_layer = getattr(self, "dense", None)
+            if dense_layer is not None:
+                with tf.name_scope(dense_layer.name):
+                    dense_layer.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->RobertaPreLayerNorm
+class TFRobertaPreLayerNormSelfAttention(keras.layers.Layer):
+    """
+    Multi-head scaled dot-product attention with optional cross-attention and key/value caching.
+
+    Queries always come from `hidden_states`. Keys and values come from `encoder_hidden_states` when it is given
+    (cross-attention), otherwise from `hidden_states`, optionally extended with or replaced by cached tensors
+    from `past_key_value`.
+    """
+
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        # Scaling factor applied to the raw attention scores in call().
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+        self.query = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
+        self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+
+        self.is_decoder = config.is_decoder
+        self.config = config
+
+    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+        """Split the last dimension into heads and move the head axis in front of the sequence axis."""
+        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Compute attention over `hidden_states`.
+
+        Returns a tuple starting with the attention output, followed by the attention probabilities when
+        `output_attentions` is true, followed by the `(key, value)` pair when the layer is configured as a decoder.
+        """
+        batch_size = shape_list(hidden_states)[0]
+        mixed_query_layer = self.query(inputs=hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            # Cross-attention without a cache: project the encoder states.
+            key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            # Self-attention with a cache: append the new keys/values to the cached ones along axis 2.
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
+            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
+        else:
+            # Plain self-attention without a cache.
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # (batch size, num_heads, seq_len_q, seq_len_k)
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+        # Cast the scale to the scores' dtype before dividing.
+        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+        attention_scores = tf.divide(attention_scores, dk)
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in the TFRobertaPreLayerNormModel call() function)
+            attention_scores = tf.add(attention_scores, attention_mask)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(inputs=attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = tf.multiply(attention_probs, head_mask)
+
+        attention_output = tf.matmul(attention_probs, value_layer)
+        # Move the head axis back behind the sequence axis so the heads can be merged.
+        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+        # (batch_size, seq_len_q, all_head_size)
+        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+    def build(self, input_shape=None):
+        """Build the query/key/value projections, each under its own name scope; later calls are no-ops."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.config.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.config.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.config.hidden_size])
+
+
+class TFRobertaPreLayerNormSelfOutput(keras.layers.Layer):
+    """Dense projection and dropout, followed by adding `input_tensor` back as a residual."""
+
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        dense_init = get_initializer(config.initializer_range)
+        self.dense = keras.layers.Dense(units=config.hidden_size, kernel_initializer=dense_init, name="dense")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        projected = self.dense(inputs=hidden_states)
+        projected = self.dropout(inputs=projected, training=training)
+        # Residual connection; no LayerNorm is applied in this layer.
+        return projected + input_tensor
+
+    def build(self, input_shape=None):
+        """Build the dense sublayer under its own name scope; later calls are no-ops."""
+        if not self.built:
+            self.built = True
+            dense_layer = getattr(self, "dense", None)
+            if dense_layer is not None:
+                with tf.name_scope(dense_layer.name):
+                    dense_layer.build([None, None, self.config.hidden_size])
+
+
+class TFRobertaPreLayerNormAttention(keras.layers.Layer):
+    """Attention block that applies LayerNorm to its input before self-attention, then the output projection."""
+
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.self_attention = TFRobertaPreLayerNormSelfAttention(config, name="self")
+        self.dense_output = TFRobertaPreLayerNormSelfOutput(config, name="output")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.config = config
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention.prune_heads
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(
+        self,
+        input_tensor: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Normalize `input_tensor`, run self-attention on the normalized states, then project the result.
+
+        Returns a tuple whose first element is the projected attention output; any further elements returned by
+        the self-attention layer are appended unchanged.
+        """
+        normed_input = self.LayerNorm(inputs=input_tensor)
+        attn_results = self.self_attention(
+            hidden_states=normed_input,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        # The output layer receives the original (un-normalized) input as `input_tensor`.
+        projected = self.dense_output(hidden_states=attn_results[0], input_tensor=input_tensor, training=training)
+
+        # add attentions (possibly with past_key_value) if we output them
+        return (projected,) + attn_results[1:]
+
+    def build(self, input_shape=None):
+        """Build the sublayers, each under its own name scope; later calls are no-ops."""
+        if self.built:
+            return
+        self.built = True
+        # The two composite sublayers take no shape; they build their own children.
+        for attr_name in ("self_attention", "dense_output"):
+            child = getattr(self, attr_name, None)
+            if child is not None:
+                with tf.name_scope(child.name):
+                    child.build(None)
+        norm_layer = getattr(self, "LayerNorm", None)
+        if norm_layer is not None:
+            with tf.name_scope(norm_layer.name):
+                norm_layer.build([None, None, self.config.hidden_size])
+
+
+class TFRobertaPreLayerNormIntermediate(keras.layers.Layer):
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dense = keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.LayerNorm(inputs=hidden_states)
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+class TFRobertaPreLayerNormOutput(keras.layers.Layer):
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = hidden_states + input_tensor
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.intermediate_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->RobertaPreLayerNorm
+class TFRobertaPreLayerNormLayer(keras.layers.Layer):
+    """
+    One transformer block: self-attention, an optional cross-attention (decoder only), then the
+    intermediate and output feed-forward sub-layers.
+    """
+
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        """
+        Create the sub-layers of the block.
+
+        Raises:
+            ValueError: If `config.add_cross_attention` is set while `config.is_decoder` is not.
+        """
+        super().__init__(**kwargs)
+
+        self.attention = TFRobertaPreLayerNormAttention(config, name="attention")
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        # The `crossattention` attribute only exists when cross-attention was requested; `call` checks
+        # for it with `hasattr`.
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = TFRobertaPreLayerNormAttention(config, name="crossattention")
+        self.intermediate = TFRobertaPreLayerNormIntermediate(config, name="intermediate")
+        # Stored as `bert_output` but given the layer name "output".
+        self.bert_output = TFRobertaPreLayerNormOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_value: tuple[tf.Tensor] | None,
+        output_attentions: bool,
+        training: bool = False,
+    ) -> tuple[tf.Tensor]:
+        """
+        Run the block on `hidden_states`.
+
+        Returns:
+            A tuple that starts with the block output, followed by whatever extra elements the attention
+            sub-layers returned (excluding their trailing cache entry in decoder mode). In decoder mode
+            the combined key/value cache is appended as the last element.
+
+        Raises:
+            ValueError: If the layer is a decoder, `encoder_hidden_states` is given, and the layer was
+                created without cross-attention.
+        """
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # Self-attention never sees the encoder tensors; those are reserved for the cross-attention below.
+        self_attention_outputs = self.attention(
+            input_tensor=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            past_key_value=self_attn_past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            # The cross-attention consumes the self-attention output and is given the same `attention_mask`
+            # as the self-attention, plus the encoder tensors.
+            cross_attention_outputs = self.crossattention(
+                input_tensor=attention_output,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Feed-forward part; `attention_output` is also handed to the output sub-layer as `input_tensor`.
+        intermediate_output = self.intermediate(hidden_states=attention_output)
+        layer_output = self.bert_output(
+            hidden_states=intermediate_output, input_tensor=attention_output, training=training
+        )
+        outputs = (layer_output,) + outputs  # add attentions if we output them
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        """Build every sub-layer once, each inside a name scope taken from its own `.name`."""
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "bert_output", None) is not None:
+            with tf.name_scope(self.bert_output.name):
+                self.bert_output.build(None)
+        # Only present when the layer was created with cross-attention.
+        if getattr(self, "crossattention", None) is not None:
+            with tf.name_scope(self.crossattention.name):
+                self.crossattention.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->RobertaPreLayerNorm
+class TFRobertaPreLayerNormEncoder(keras.layers.Layer):
+    def __init__(self, config: RobertaPreLayerNormConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layer = [TFRobertaPreLayerNormLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor | None,
+        encoder_attention_mask: tf.Tensor | None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None,
+        use_cache: bool | None,
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPastAndCrossAttentions | tuple[tf.Tensor]:
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask[i],
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention and encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
+            )
+
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFRobertaPreLayerNormMainLayer(keras.layers.Layer):
+    config_class = RobertaPreLayerNormConfig
+
+    def __init__(self, config, add_pooling_layer=True, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.is_decoder = config.is_decoder
+
+        self.num_hidden_layers = config.num_hidden_layers
+        self.initializer_range = config.initializer_range
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.return_dict = config.use_return_dict
+        self.encoder = TFRobertaPreLayerNormEncoder(config, name="encoder")
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.pooler = TFRobertaPreLayerNormPooler(config, name="pooler") if add_pooling_layer else None
+        # The embeddings must be the last declaration in order to follow the weights order
+        self.embeddings = TFRobertaPreLayerNormEmbeddings(config, name="embeddings")
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.embeddings
+
+    def set_input_embeddings(self, value: tf.Variable):
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFBaseModelOutputWithPoolingAndCrossAttentions | tuple[tf.Tensor]:
+        if not self.config.is_decoder:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+
+        if past_key_values is None:
+            past_key_values_length = 0
+            past_key_values = [None] * len(self.encoder.layer)
+        else:
+            past_key_values_length = shape_list(past_key_values[0][0])[-2]
+
+        if attention_mask is None:
+            attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1)
+
+        if token_type_ids is None:
+            token_type_ids = tf.fill(dims=input_shape, value=0)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+            training=training,
+        )
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        attention_mask_shape = shape_list(attention_mask)
+
+        mask_seq_length = seq_length + past_key_values_length
+        # Provided a padding mask of dimensions [batch_size, mask_seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+        if self.is_decoder:
+            seq_ids = tf.range(mask_seq_length)
+            causal_mask = tf.less_equal(
+                tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)),
+                seq_ids[None, :, None],
+            )
+            causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype)
+            extended_attention_mask = causal_mask * attention_mask[:, None, :]
+            attention_mask_shape = shape_list(extended_attention_mask)
+            extended_attention_mask = tf.reshape(
+                extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2])
+            )
+            if past_key_values[0] is not None:
+                # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length]
+                extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :]
+        else:
+            extended_attention_mask = tf.reshape(
+                attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1])
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
+        one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
+        ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
+        extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
+
+        if self.is_decoder and encoder_attention_mask is not None:
+            # If a 2D ou 3D attention mask is provided for the cross-attention
+            # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length]
+            # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.LayerNorm(inputs=sequence_output)
+        pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaPreTrainedModel with Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class TFRobertaPreLayerNormPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    # Configuration class associated with this model family.
+    config_class = RobertaPreLayerNormConfig
+    # Same string as the attribute and layer name under which `TFRobertaPreLayerNormModel` stores its main layer.
+    base_model_prefix = "roberta_prelayernorm"
+
+
+# Shared introductory documentation text; it is passed to `add_start_docstrings` when decorating
+# `TFRobertaPreLayerNormModel`.
+ROBERTA_PRELAYERNORM_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`RobertaPreLayerNormConfig`]): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+# Documentation template for the forward-pass arguments. The `{0}` placeholders are filled through
+# `.format(...)` with the input shape text (e.g. "batch_size, sequence_length") where the template is used.
+ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare RoBERTa-PreLayerNorm Model transformer outputting raw hidden-states without any specific head on top.",
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class TFRobertaPreLayerNormModel(TFRobertaPreLayerNormPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(config, name="roberta_prelayernorm")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutputWithPoolingAndCrossAttentions:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`)
+            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation
+        """
+        outputs = self.roberta_prelayernorm(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta_prelayernorm", None) is not None:
+            with tf.name_scope(self.roberta_prelayernorm.name):
+                self.roberta_prelayernorm.build(None)
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->RobertaPreLayerNorm
+class TFRobertaPreLayerNormLMHead(keras.layers.Layer):
+    """RobertaPreLayerNorm Head for masked language modeling."""
+
+    def __init__(self, config, input_embeddings, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.dense = keras.layers.Dense(
+            config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm")
+        self.act = get_tf_activation("gelu")
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = input_embeddings
+
+    def build(self, input_shape=None):
+        self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.hidden_size])
+
+    def get_output_embeddings(self):
+        return self.decoder
+
+    def set_output_embeddings(self, value):
+        self.decoder.weight = value
+        self.decoder.vocab_size = shape_list(value)[0]
+
+    def get_bias(self):
+        return {"bias": self.bias}
+
+    def set_bias(self, value):
+        self.bias = value["bias"]
+        self.config.vocab_size = shape_list(value["bias"])[0]
+
+    def call(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+
+        # project back to size of vocabulary with bias
+        seq_length = shape_list(tensor=hidden_states)[1]
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
+        hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
+
+        return hidden_states
+
+
+@add_start_docstrings(
+    """RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING
+)
+class TFRobertaPreLayerNormForMaskedLM(TFRobertaPreLayerNormPreTrainedModel, TFMaskedLanguageModelingLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
+
+    # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
+            config, add_pooling_layer=False, name="roberta_prelayernorm"
+        )
+        self.lm_head = TFRobertaPreLayerNormLMHead(config, self.roberta_prelayernorm.embeddings, name="lm_head")
+
+    def get_lm_head(self):
+        return self.lm_head
+
+    def get_prefix_bias_name(self):
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.lm_head.name
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+        mask="<mask>",
+        expected_output="' Paris'",
+        expected_loss=0.69,
+    )
+    # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM.call with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMaskedLMOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores)
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMaskedLMOutput(
+            loss=loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta_prelayernorm", None) is not None:
+            with tf.name_scope(self.roberta_prelayernorm.name):
+                self.roberta_prelayernorm.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build(None)
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class TFRobertaPreLayerNormForCausalLM(TFRobertaPreLayerNormPreTrainedModel, TFCausalLanguageModelingLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"]
+
+    def __init__(self, config: RobertaPreLayerNormConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        if not config.is_decoder:
+            logger.warning(
+                "If you want to use `TFRobertaPreLayerNormLMHeadModel` as a standalone, add `is_decoder=True.`"
+            )
+
+        self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
+            config, add_pooling_layer=False, name="roberta_prelayernorm"
+        )
+        self.lm_head = TFRobertaPreLayerNormLMHead(
+            config, input_embeddings=self.roberta_prelayernorm.embeddings, name="lm_head"
+        )
+
+    def get_lm_head(self):
+        return self.lm_head
+
+    def get_prefix_bias_name(self):
+        warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
+        return self.name + "/" + self.lm_head.name
+
+    # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = tf.ones(input_shape)
+
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFCausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        encoder_hidden_states: np.ndarray | tf.Tensor | None = None,
+        encoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFCausalLMOutputWithCrossAttentions | tuple[tf.Tensor]:
+        r"""
+        encoder_hidden_states  (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`)
+            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        outputs = self.roberta_prelayernorm(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.lm_head(hidden_states=sequence_output, training=training)
+        loss = None
+
+        if labels is not None:
+            # shift labels to the left and cut last logit token
+            shifted_logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.hf_compute_loss(labels=labels, logits=shifted_logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta_prelayernorm", None) is not None:
+            with tf.name_scope(self.roberta_prelayernorm.name):
+                self.roberta_prelayernorm.build(None)
+        if getattr(self, "lm_head", None) is not None:
+            with tf.name_scope(self.lm_head.name):
+                self.lm_head.build(None)
+
+
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead with Roberta->RobertaPreLayerNorm
+class TFRobertaPreLayerNormClassificationHead(keras.layers.Layer):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.dense = keras.layers.Dense(
+            config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.out_proj = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj"
+        )
+        self.config = config
+
+    def call(self, features, training=False):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x, training=training)
+        x = self.dense(x)
+        x = self.dropout(x, training=training)
+        x = self.out_proj(x)
+        return x
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    RoBERTa-PreLayerNorm Model transformer with a sequence classification/regression head on top (a linear layer on top
+    of the pooled output) e.g. for GLUE tasks.
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+class TFRobertaPreLayerNormForSequenceClassification(
+    TFRobertaPreLayerNormPreTrainedModel, TFSequenceClassificationLoss
+):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
+            config, add_pooling_layer=False, name="roberta_prelayernorm"
+        )
+        self.classifier = TFRobertaPreLayerNormClassificationHead(config, name="classifier")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification.call with roberta->roberta_prelayernorm
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFSequenceClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output, training=training)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta_prelayernorm", None) is not None:
+            with tf.name_scope(self.roberta_prelayernorm.name):
+                self.roberta_prelayernorm.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build(None)
+
+
+@add_start_docstrings(
+    """
+    RobertaPreLayerNorm Model with a multiple choice classification head on top (a linear layer on top of the pooled
+    output and a softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm
+class TFRobertaPreLayerNormForMultipleChoice(TFRobertaPreLayerNormPreTrainedModel, TFMultipleChoiceLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"lm_head"]
+    _keys_to_ignore_on_load_missing = [r"dropout"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(config, name="roberta_prelayernorm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = keras.layers.Dense(
+            1, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(
+        ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+    )
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFMultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFMultipleChoiceModelOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]`
+            where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above)
+        """
+
+        if input_ids is not None:
+            num_choices = shape_list(input_ids)[1]
+            seq_length = shape_list(input_ids)[2]
+        else:
+            num_choices = shape_list(inputs_embeds)[1]
+            seq_length = shape_list(inputs_embeds)[2]
+
+        flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None
+        flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
+        flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
+        flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
+        outputs = self.roberta_prelayernorm(
+            flat_input_ids,
+            flat_attention_mask,
+            flat_token_type_ids,
+            flat_position_ids,
+            head_mask,
+            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output, training=training)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = tf.reshape(logits, (-1, num_choices))
+
+        loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFMultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta_prelayernorm", None) is not None:
+            with tf.name_scope(self.roberta_prelayernorm.name):
+                self.roberta_prelayernorm.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    RoBERTa-PreLayerNorm Model with a token classification head on top (a linear layer on top of the hidden-states
+    output) e.g. for Named-Entity-Recognition (NER) tasks.
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+class TFRobertaPreLayerNormForTokenClassification(TFRobertaPreLayerNormPreTrainedModel, TFTokenClassificationLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
+    _keys_to_ignore_on_load_missing = [r"dropout"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
+            config, add_pooling_layer=False, name="roberta_prelayernorm"
+        )
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = keras.layers.Dropout(classifier_dropout)
+        self.classifier = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification.call with roberta->roberta_prelayernorm
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFTokenClassifierOutput | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=training)
+        logits = self.classifier(sequence_output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFTokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta_prelayernorm", None) is not None:
+            with tf.name_scope(self.roberta_prelayernorm.name):
+                self.roberta_prelayernorm.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_size])
+
+
+@add_start_docstrings(
+    """
+    RoBERTa-PreLayerNorm Model with a span classification head on top for extractive question-answering tasks like
+    SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    ROBERTA_PRELAYERNORM_START_DOCSTRING,
+)
+class TFRobertaPreLayerNormForQuestionAnswering(TFRobertaPreLayerNormPreTrainedModel, TFQuestionAnsweringLoss):
+    # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model
+    _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"]
+
+    def __init__(self, config, *inputs, **kwargs):
+        # Sets up the backbone and the dense layer that produces the span logits.
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        # Backbone created without a pooling layer; this class's `call` only reads output index 0 from it.
+        self.roberta_prelayernorm = TFRobertaPreLayerNormMainLayer(
+            config, add_pooling_layer=False, name="roberta_prelayernorm"
+        )
+        # NOTE(review): `call` splits these logits into two halves on the last axis and squeezes that axis,
+        # so `config.num_labels` is presumably expected to be 2 here — confirm against the config in use.
+        self.qa_outputs = keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    # Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering.call with roberta->roberta_prelayernorm
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        start_positions: np.ndarray | tf.Tensor | None = None,
+        end_positions: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> TFQuestionAnsweringModelOutput | tuple[tf.Tensor]:
+        r"""
+        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        outputs = self.roberta_prelayernorm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        loss = None
+        if start_positions is not None and end_positions is not None:
+            labels = {"start_position": start_positions}
+            labels["end_position"] = end_positions
+            loss = self.hf_compute_loss(labels, (start_logits, end_logits))
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFQuestionAnsweringModelOutput(
+            loss=loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "roberta_prelayernorm", None) is not None:
+            with tf.name_scope(self.roberta_prelayernorm.name):
+                self.roberta_prelayernorm.build(None)
+        if getattr(self, "qa_outputs", None) is not None:
+            with tf.name_scope(self.qa_outputs.name):
+                self.qa_outputs.build([None, None, self.config.hidden_size])
+
+
+__all__ = [  # public names exported by this module
+    "TFRobertaPreLayerNormForCausalLM",
+    "TFRobertaPreLayerNormForMaskedLM",
+    "TFRobertaPreLayerNormForMultipleChoice",
+    "TFRobertaPreLayerNormForQuestionAnswering",
+    "TFRobertaPreLayerNormForSequenceClassification",
+    "TFRobertaPreLayerNormForTokenClassification",
+    "TFRobertaPreLayerNormMainLayer",
+    "TFRobertaPreLayerNormModel",
+    "TFRobertaPreLayerNormPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec918f62dae19e228e310eb0c30c13d37b0bae8f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13c9528a4fc83e4537767cb09aa58791fa90b785
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b0ea6d138a906b841323c0b0cad5688ec9e3af2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed7f4e0418ab53b3f63a9dc7ad3d41f8638c0ce6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a206f3305dc7d8543ba9a3dfc5cca37f9d83f6e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..512ed1490b0a6d6e19d3673b8eeaf57f246519be
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/configuration_seamless_m4t_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/configuration_seamless_m4t_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edca02532183941fa4adb0576b42c4656a2673cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__pycache__/configuration_seamless_m4t_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47198204739cda38994e61436895ef5345a42df9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/configuration_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/configuration_sew.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f6cb0378d5259d03d79fcdc9d01717bf8805c14
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/configuration_sew.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/feature_extraction_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/feature_extraction_sew.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a15abb306adc8fc36dcf1c05f67be38befc3582
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/feature_extraction_sew.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modeling_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modeling_sew.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85a81f7f9701e4cab8dab19c89c561a43cfbff46
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modeling_sew.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modular_sew.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modular_sew.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b52e5272793148974e55be67c0bf30dfb25bbf0c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sew/__pycache__/modular_sew.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d667f0404799ad7ccc0e91cb89118ae54fca31eb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/configuration_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/configuration_siglip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3da07d430abaeb67fb0120cfe5d706a14ba3027e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/configuration_siglip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77a9052b4992de31360e0ca53e6fb2e3e55e4cfc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96cc331a80be6df9da4531955bed1d3851241515
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/image_processing_siglip_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/modeling_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/modeling_siglip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4c813690e4f1869248ffd42a892cb160ef4fb35
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/modeling_siglip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/processing_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/processing_siglip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40648dfa6de37cfcb8c63162f1de416961eba882
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/processing_siglip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/tokenization_siglip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/tokenization_siglip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9724b871ead71c72dd27b4287c8b8f1b62e11783
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip/__pycache__/tokenization_siglip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22f337bbee2b7e6a800a7248fc7ae961c07e0f07
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/configuration_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/configuration_siglip2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee74d6efdbe268b29fbbb0f08a77896c1377e965
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/configuration_siglip2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f89565586d03af4b843cadfa37625bd3ac463194
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d61e62083ba50c96f0567e9e151726748ce8d40
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/image_processing_siglip2_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modeling_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modeling_siglip2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcce5899717c5a6a442e2d1b28f1cf32c76ce53d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modeling_siglip2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modular_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modular_siglip2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bc50d8e6d0b65cfc38a2b0a2e7f21d0fd050df8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/modular_siglip2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/processing_siglip2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/processing_siglip2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f81f25bfed698ccfc4246e3e0c6d77e116e086f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/siglip2/__pycache__/processing_siglip2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bbb950cb6e690a00c3072db255a0da06acf8786
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54a88cab43b8147db5e5f46f9956b2dd51d68568
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..daf49be994dd34851c94e82b24e69c025aac8f2e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0979af46c651400ad9ec7b35896025ecd7359412
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..878deb9f4dd7762d5721717042786aa98e792826
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2882fb051a3115098c29c0c63079f12c9aec05d8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/image_processing_smolvlm_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modeling_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modeling_smolvlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04c9924af19248a034d70cc63e273bc4d9e99106
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modeling_smolvlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modular_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modular_smolvlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcb99845eab3b56d6b8ec184f20ba6e938eaab15
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/modular_smolvlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/processing_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/processing_smolvlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a38729613236f71f6ab3e3c2da270207e86b6df
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/processing_smolvlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/video_processing_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/video_processing_smolvlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4457f0f1cc58845691242df2ee3c690499eee237
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/video_processing_smolvlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b41c180e350e5b75b7451ccd5a31e9bed0143684
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..abc5632cf4f8f5af8521ed8f4426f332365329b3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..adc8c01a2549a5f52e003dfe7301af79689847ed
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4370d21e3b557b2c1adb3381acfa8f674ca7768
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23422212827afa9aafe28bc86e3fd71d547d8733
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e8c6b7257bc0ba9344a98129df3e0326f964dd5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9f8174e27f472b4fbc819c2db83efbde9ce769d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6f021c148b3e830dea8f37a11262718a7e68d13
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d2a187fc43991fe0a73e99943f2ec37e25f7deb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8afbd8395d1036a70eae0e5837acb623b63e60d6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a11a3d040bb4624a4bc64699e01818139f690237
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31f4cc783dc73fc9121ee3654331bc20e441d311
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ffc8024094b2dc450ac4fe362cccec99d4aadf7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/configuration_stablelm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/configuration_stablelm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da642d9fc1ff306cfc3343a1ab25a0c4f4a39a87
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/configuration_stablelm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/modeling_stablelm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/modeling_stablelm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95ec9a07f92609c80a0767911fa437dac84fa71d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/stablelm/__pycache__/modeling_stablelm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94a0c13d20db3b34225ea0d1cebde871daa99c2a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bbec9bc7adb265f0af09419d566aa7f90c159a4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93127d6a4acc8a6545c2df3d21e38279bf6331ae
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89ce09c8e1923fda3e4a77f13186e4d0fc08da83
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cc55f20aebce3262a9420ff8476a98a7cc811b1d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/configuration_superpoint.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/configuration_superpoint.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f48b50a5184066b97ca86799884541521e7ba7cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/configuration_superpoint.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab15016cd5ea21bc02ee149a65e17a089e03d0d8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6527e2053648bb5b96156c51eec2da8fb68a78f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/image_processing_superpoint_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/modeling_superpoint.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/modeling_superpoint.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7e23f71b6296bf296cb1a9bdbc5b93066803546
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/superpoint/__pycache__/modeling_superpoint.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ee8774ab5db8d3352efcfaa81159ec1247fc9b3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/configuration_t5gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/configuration_t5gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1e1bf5383ad515d80eb0e476017159101b33f87
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/configuration_t5gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modular_t5gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modular_t5gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74ce4f8480e8af2037cb7df1589566e281c20514
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modular_t5gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7fc4762a062a86bc5c6dbd296171ab495fd1a104
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/configuration_table_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/configuration_table_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24d4268d39742ffcd826ee0411346c5d64f697ac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/configuration_table_transformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/modeling_table_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/modeling_table_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b3145d7a24e193e0d5ad84e4eb79b59a36b9611
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/table_transformer/__pycache__/modeling_table_transformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2092b57cc781fb8332f3dafd139860e3adf5fbba
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/configuration_textnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/configuration_textnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b5ee37ad9e2e44b665c3f7f646d03665b9ff87d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/configuration_textnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b998802c4c60314fe1e7da92a6023ec8c797ef68
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bc301f6c8713609500022a624f49c2a70174a92
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/image_processing_textnet_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/modeling_textnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/modeling_textnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37f12f8bb1dfcd09c8b1e28fb5f93146ebd9ac48
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/textnet/__pycache__/modeling_textnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e493c4ccc472251b942d744ec761a510645a573a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/configuration_time_series_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/configuration_time_series_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6be9ab909f7312344be6b9eb1e1f36211de6ae20
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/configuration_time_series_transformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/modeling_time_series_transformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/modeling_time_series_transformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6035c533cc5d036c99b11c35ed68699c44d7df1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/time_series_transformer/__pycache__/modeling_time_series_transformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db1a9e12457d640f0f0fd4a23934dd9dfedf30b7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc2a542642d500de73f6d68704d85e56f91ec146
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53b5e40aae9b57767d954f6f1be75c907efc945e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a03daae25f4535c96c32e20dc4b13a817518bc06
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_unispeech import *
+    from .modeling_unispeech import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/configuration_unispeech.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/configuration_unispeech.py
new file mode 100644
index 0000000000000000000000000000000000000000..a71ba7f1b7d2a57671e8e75f1c773b0665e1614a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/configuration_unispeech.py
@@ -0,0 +1,309 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""UniSpeech model configuration"""
+
+import functools
+import operator
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class UniSpeechConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`UniSpeechModel`]. It is used to instantiate an
+    UniSpeech model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the UniSpeech
+    [microsoft/unispeech-large-1500h-cv](https://huggingface.co/microsoft/unispeech-large-1500h-cv) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32):
+            Vocabulary size of the UniSpeech model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`UniSpeechModel`]. Vocabulary size of the model. Defines the
+            different tokens that can be represented by the *inputs_ids* passed to the forward method of
+            [`UniSpeechModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for activations inside the fully connected layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the output of the feature encoder that's used by the quantizer.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`UniSpeechForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
+            convolutional layers.
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        conv_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether the 1D convolutional layers have a bias.
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+            embeddings layer.
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+            Number of groups of 1D convolutional positional embeddings layer.
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
+            True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
+            False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+            Recognition](https://huggingface.co/papers/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
+            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+            actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the time axis.
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+            irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+            mask_time_min_masks''
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
+            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_feature_length'' independent masks over
+            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+            may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+            True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the feature axis.
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
+            ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
+            Number of entries in each quantization codebook (group).
+        num_codevector_groups (`int`, *optional*, defaults to 2):
+            Number of codevector groups for product codevector quantization.
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        num_negatives (`int`, *optional*, defaults to 100):
+            Number of negative samples for the contrastive loss.
+        codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the quantized feature vectors.
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the final projection of both the quantized and the transformer features.
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
+            The weight of the codebook diversity loss component.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`UniSpeechForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`UniSpeechForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+            instance of [`UniSpeechForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the projection before token mean-pooling for classification.
+        num_ctc_classes (`int`, *optional*, defaults to 80):
+            Specifies the number of classes (phoneme tokens and blank token) for phoneme-level CTC loss. Only relevant
+            when using an instance of [`UniSpeechForPreTraining`].
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        replace_prob (`float`, *optional*, defaults to 0.5):
+            Probability that transformer feature is replaced by quantized feature for pretraining.
+
+    Example:
+
+    ```python
+    >>> from transformers import UniSpeechConfig, UniSpeechModel
+
+    >>> # Initializing a UniSpeech microsoft/unispeech-large-1500h-cv style configuration
+    >>> configuration = UniSpeechConfig()
+
+    >>> # Initializing a model (with random weights) from the microsoft/unispeech-large-1500h-cv style configuration
+    >>> model = UniSpeechModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "unispeech"
+
+    def __init__(
+        self,
+        vocab_size=32,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout=0.1,
+        activation_dropout=0.1,
+        attention_dropout=0.1,
+        feat_proj_dropout=0.0,
+        feat_quantizer_dropout=0.0,
+        final_dropout=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        feat_extract_norm="group",
+        feat_extract_activation="gelu",
+        conv_dim=(512, 512, 512, 512, 512, 512, 512),
+        conv_stride=(5, 2, 2, 2, 2, 2, 2),
+        conv_kernel=(10, 3, 3, 3, 3, 2, 2),
+        conv_bias=False,
+        num_conv_pos_embeddings=128,
+        num_conv_pos_embedding_groups=16,
+        do_stable_layer_norm=False,
+        apply_spec_augment=True,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        mask_feature_min_masks=0,
+        num_codevectors_per_group=320,
+        num_codevector_groups=2,
+        contrastive_logits_temperature=0.1,
+        num_negatives=100,
+        codevector_dim=256,
+        proj_codevector_dim=256,
+        diversity_loss_weight=0.1,
+        ctc_loss_reduction="mean",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=256,
+        num_ctc_classes=80,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        replace_prob=0.5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        self.hidden_size = hidden_size
+        self.feat_extract_norm = feat_extract_norm
+        self.feat_extract_activation = feat_extract_activation
+        self.conv_dim = list(conv_dim)
+        self.conv_stride = list(conv_stride)
+        self.conv_kernel = list(conv_kernel)
+        self.conv_bias = conv_bias
+        self.num_conv_pos_embeddings = num_conv_pos_embeddings
+        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+        self.num_feat_extract_layers = len(self.conv_dim)
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_attention_heads = num_attention_heads
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.feat_proj_dropout = feat_proj_dropout
+        self.final_dropout = final_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.num_ctc_classes = num_ctc_classes
+        self.vocab_size = vocab_size
+        self.do_stable_layer_norm = do_stable_layer_norm
+        self.use_weighted_layer_sum = use_weighted_layer_sum
+        self.classifier_proj_size = classifier_proj_size
+
+        if (
+            (len(self.conv_stride) != self.num_feat_extract_layers)
+            or (len(self.conv_kernel) != self.num_feat_extract_layers)
+            or (len(self.conv_dim) != self.num_feat_extract_layers)
+        ):
+            raise ValueError(
+                "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
+                " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
+                f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
+                f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
+            )
+
+        # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779
+        self.apply_spec_augment = apply_spec_augment
+        self.mask_time_prob = mask_time_prob
+        self.mask_time_length = mask_time_length
+        self.mask_time_min_masks = mask_time_min_masks
+        self.mask_feature_prob = mask_feature_prob
+        self.mask_feature_length = mask_feature_length
+        self.mask_feature_min_masks = mask_feature_min_masks
+
+        # parameters for pretraining with codevector quantized representations
+        self.num_codevectors_per_group = num_codevectors_per_group
+        self.num_codevector_groups = num_codevector_groups
+        self.contrastive_logits_temperature = contrastive_logits_temperature
+        self.feat_quantizer_dropout = feat_quantizer_dropout
+        self.num_negatives = num_negatives
+        self.codevector_dim = codevector_dim
+        self.proj_codevector_dim = proj_codevector_dim
+        self.diversity_loss_weight = diversity_loss_weight
+
+        # ctc loss
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+
+        # pretraining loss
+        self.replace_prob = replace_prob
+
+    @property
+    def inputs_to_logits_ratio(self):
+        return functools.reduce(operator.mul, self.conv_stride, 1)
+
+
+__all__ = ["UniSpeechConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modeling_unispeech.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modeling_unispeech.py
new file mode 100644
index 0000000000000000000000000000000000000000..233653a36b39c58d3edcc984cefec9f8e44961be
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modeling_unispeech.py
@@ -0,0 +1,1518 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/unispeech/modular_unispeech.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_unispeech.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    ModelOutput,
+    SequenceClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, is_torch_flex_attn_available, logging
+from .configuration_unispeech import UniSpeechConfig
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
+    """
+)
+class UniSpeechForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
+        paper](https://huggingface.co/papers/2006.11477).
+    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
+        projected quantized states.
+    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
+        target vectors for contrastive loss.
+    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
+        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    projected_states: Optional[torch.FloatTensor] = None
+    projected_quantized_states: Optional[torch.FloatTensor] = None
+    codevector_perplexity: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class UniSpeechSamePadLayer(nn.Module):  # trims the surplus trailing frame left by the positional convolution's padding
+    def __init__(self, num_conv_pos_embeddings):
+        super().__init__()
+        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0  # only even kernel sizes need one frame removed
+
+    def forward(self, hidden_states):
+        if self.num_pad_remove > 0:
+            hidden_states = hidden_states[:, :, : -self.num_pad_remove]  # slices the last of three axes
+        return hidden_states
+
+
+class UniSpeechPositionalConvEmbedding(nn.Module):  # weight-normalized grouped Conv1d; the encoders add its output to their hidden states
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=config.num_conv_pos_embeddings,
+            padding=config.num_conv_pos_embeddings // 2,  # with an even kernel this gives one extra output frame, trimmed by `self.padding`
+            groups=config.num_conv_pos_embedding_groups,
+        )
+
+        weight_norm = nn.utils.weight_norm  # fallback used when the parametrization-based variant is missing
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm  # preferred whenever this torch build provides it
+
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):  # weight_norm is applied inside the gather context
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
+            if hasattr(self.conv, "parametrizations"):  # parametrization API: the two factors live in original0 / original1
+                weight_g = self.conv.parametrizations.weight.original0
+                weight_v = self.conv.parametrizations.weight.original1
+            else:  # other API: the factors are plain attributes weight_g / weight_v
+                weight_g = self.conv.weight_g
+                weight_v = self.conv.weight_v
+            deepspeed.zero.register_external_parameter(self, weight_v)
+            deepspeed.zero.register_external_parameter(self, weight_g)
+        else:
+            self.conv = weight_norm(self.conv, name="weight", dim=2)
+
+        self.padding = UniSpeechSamePadLayer(config.num_conv_pos_embeddings)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose(1, 2)  # swap dims 1 and 2 so Conv1d receives channels on dim 1
+
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.padding(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)  # swap back to the layout the caller passed in
+        return hidden_states
+
+
+class UniSpeechNoLayerNormConvLayer(GradientCheckpointingLayer):  # Conv1d followed by the activation, with no normalization
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # layer 0 takes a single input channel
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class UniSpeechLayerNormConvLayer(GradientCheckpointingLayer):  # Conv1d, then LayerNorm over the channel axis, then the activation
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # layer 0 takes a single input channel
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+
+        hidden_states = hidden_states.transpose(-2, -1)  # LayerNorm acts on the last axis, so move channels there
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states.transpose(-2, -1)  # put channels back on the second-to-last axis
+
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class UniSpeechGroupNormConvLayer(GradientCheckpointingLayer):  # Conv1d, then GroupNorm, then the activation
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1  # layer 0 takes a single input channel
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)  # one group per channel
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)  # named layer_norm, but the module is a GroupNorm
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class UniSpeechFeatureEncoder(nn.Module):
+    """Construct the features from raw audio waveform"""
+
+    def __init__(self, config):
+        super().__init__()
+
+        if config.feat_extract_norm == "group":  # GroupNorm on the first layer only; the remaining layers are un-normalized
+            conv_layers = [UniSpeechGroupNormConvLayer(config, layer_id=0)] + [
+                UniSpeechNoLayerNormConvLayer(config, layer_id=i + 1)
+                for i in range(config.num_feat_extract_layers - 1)
+            ]
+        elif config.feat_extract_norm == "layer":  # every layer applies LayerNorm
+            conv_layers = [
+                UniSpeechLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
+            ]
+        else:
+            raise ValueError(
+                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+            )
+        self.conv_layers = nn.ModuleList(conv_layers)
+        self.gradient_checkpointing = False
+        self._requires_grad = True  # cleared by `_freeze_parameters`
+
+    def _freeze_parameters(self):  # disables gradients for every parameter of this encoder
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def forward(self, input_values):
+        hidden_states = input_values[:, None]  # insert a length-1 axis at dim 1 (the conv input channel)
+
+        # make sure hidden_states require grad for gradient_checkpointing
+        if self._requires_grad and self.training:
+            hidden_states.requires_grad = True
+
+        for conv_layer in self.conv_layers:
+            hidden_states = conv_layer(hidden_states)
+
+        return hidden_states
+
+
+class UniSpeechFeatureProjection(nn.Module):  # LayerNorm -> Linear(conv_dim[-1] -> hidden_size) -> Dropout
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # non-projected hidden states are needed for quantization
+        norm_hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.projection(norm_hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states, norm_hidden_states  # (projected + dropout, normalized but un-projected)
+
+
+def eager_attention_forward(  # softmax(Q @ K^T * scaling + mask) @ V, written with plain torch ops
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    if scaling is None:
+        scaling = query.size(-1) ** -0.5  # default: 1 / sqrt(size of the last query axis)
+
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling  # transpose(2, 3) requires 4-D key tensors
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask  # the mask is additive, applied before the softmax
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)  # one multiplier per index of dim 1
+
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)  # `module` is used only for its training flag
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()  # swap dims 1 and 2 before returning
+
+    return attn_output, attn_weights
+
+
+class UniSpeechAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[UniSpeechConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
+        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        # determine input shapes
+        bsz, tgt_len = hidden_states.shape[:-1]
+        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
+
+        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
+        kv_input_shape = (bsz, src_len, -1, self.head_dim)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        key_states = self.k_proj(current_states).view(*kv_input_shape).transpose(1, 2)
+        value_states = self.v_proj(current_states).view(*kv_input_shape).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            output_attentions=output_attentions,
+            head_mask=layer_head_mask,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights, None
+
+
+class UniSpeechFeedForward(nn.Module):  # Linear -> activation -> dropout -> Linear -> dropout
+    def __init__(self, config):
+        super().__init__()
+        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+
+        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):  # a string is looked up in ACT2FN; anything else is used as the callable itself
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states)
+
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states)
+        return hidden_states
+
+
+class UniSpeechEncoderLayer(GradientCheckpointingLayer):  # post-norm block: each LayerNorm runs after its residual addition
+    def __init__(self, config):
+        super().__init__()
+        self.attention = UniSpeechAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=False,
+            config=config,
+        )
+
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = UniSpeechFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+        attn_residual = hidden_states
+        hidden_states, attn_weights, _ = self.attention(
+            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+        )
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = attn_residual + hidden_states  # residual connection around the attention
+
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states + self.feed_forward(hidden_states)  # residual connection around the feed-forward
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)  # attention weights are appended only on request
+
+        return outputs
+
+
+class UniSpeechEncoder(nn.Module):  # positional conv embedding + LayerNorm + a stack of UniSpeechEncoderLayer with LayerDrop
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = UniSpeechPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([UniSpeechEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0  # in-place write into the caller's tensor; `~` assumes a boolean mask (TODO confirm)
+
+        attention_mask = self._update_full_mask(  # convert the 2-D mask into whatever the configured attention backend expects
+            attention_mask,
+            hidden_states,
+        )
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.layer_norm(hidden_states)  # this variant normalizes before the layer stack
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)  # records the input of each layer
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])  # one fresh scalar sample per layer
+
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop  # never skips outside training
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = layer(
+                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+                )
+                hidden_states = layer_outputs[0]  # NOTE(review): with synced_gpus a "skipped" layer still updates hidden_states
+
+            if skip_the_layer:
+                layer_outputs = (None, None)  # a skipped layer contributes None to the attentions tuple
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)  # finally append the output of the last layer
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def _update_full_mask(  # returns None unchanged; otherwise dispatches on config._attn_implementation
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None  # the mask is dropped when it contains no 0
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)  # helper is imported only when flex attention is available
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+class UniSpeechAttnAdapterLayer(nn.Module):
+    def __init__(self, config):
+        """
+        Bottleneck adapter: LayerNorm, a linear projection from `config.hidden_size` down to
+        `config.adapter_attn_dim`, ReLU, and a linear projection back up to `config.hidden_size`.
+        """
+        super().__init__()
+        self.input_dim = config.adapter_attn_dim  # width of the bottleneck, despite the attribute name
+        self.hidden_dim = config.hidden_size
+
+        self.norm = nn.LayerNorm(self.hidden_dim)
+        self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
+        self.act_fn = nn.ReLU()
+        self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)
+
+    def forward(self, hidden_states: torch.FloatTensor):
+        hidden_states = self.norm(hidden_states)
+
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act_fn(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states  # no residual is added here; the calling layer adds it
+
+
+class UniSpeechEncoderLayerStableLayerNorm(GradientCheckpointingLayer):  # pre-norm block: each LayerNorm runs before its sub-layer
+    def __init__(self, config):
+        super().__init__()
+        self.attention = UniSpeechAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=False,
+            config=config,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = UniSpeechFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        if getattr(config, "adapter_attn_dim", None) is not None:  # the adapter is created only when the config sets a bottleneck width
+            self.adapter_layer = UniSpeechAttnAdapterLayer(config)
+        else:
+            self.adapter_layer = None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        attn_residual = hidden_states  # the residual is taken before normalization
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states, attn_weights, _ = self.attention(
+            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+        )
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = attn_residual + hidden_states
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))  # norm feeds only the feed-forward branch
+
+        if self.adapter_layer is not None:
+            hidden_states = hidden_states + self.adapter_layer(hidden_states)  # the adapter output is added residually
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)  # attention weights are appended only on request
+
+        return outputs
+
+
+class UniSpeechEncoderStableLayerNorm(nn.Module):  # same flow as UniSpeechEncoder, but the LayerNorm runs after the layer stack
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = UniSpeechPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList(
+            [UniSpeechEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0  # in-place write into the caller's tensor; `~` assumes a boolean mask (TODO confirm)
+
+        attention_mask = self._update_full_mask(  # convert the 2-D mask into whatever the configured attention backend expects
+            attention_mask,
+            hidden_states,
+        )
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.dropout(hidden_states)  # no LayerNorm here; it is applied once after the loop
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)  # records the input of each layer
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])  # one fresh scalar sample per layer
+
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop  # never skips outside training
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication
+                layer_outputs = layer(
+                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+                )
+                hidden_states = layer_outputs[0]  # NOTE(review): with synced_gpus a "skipped" layer still updates hidden_states
+
+            if skip_the_layer:
+                layer_outputs = (None, None)  # a skipped layer contributes None to the attentions tuple
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)  # final normalization of the stack output
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)  # the last entry is the normalized output
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def _update_full_mask(  # returns None unchanged; otherwise dispatches on config._attn_implementation
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None  # the mask is dropped when it contains no 0
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)  # helper is imported only when flex attention is available
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+class UniSpeechGumbelVectorQuantizer(nn.Module):
+    """
+    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_groups = config.num_codevector_groups
+        self.num_vars = config.num_codevectors_per_group
+
+        if config.codevector_dim % self.num_groups != 0:
+            raise ValueError(
+                f"`config.codevector_dim {config.codevector_dim} must be divisible "
+                f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
+            )
+
+        # storage for codebook variables (codewords)
+        self.codevectors = nn.Parameter(
+            torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)  # allocated only; no values are assigned here
+        )
+        self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
+
+        # can be decayed for training
+        self.temperature = 2
+
+    @staticmethod
+    def _compute_perplexity(probs):  # sum over groups of exp(entropy of the distribution averaged over dim 0)
+        marginal_probs = probs.mean(dim=0)
+        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()  # 1e-7 guards log(0)
+        return perplexity
+
+    def forward(self, hidden_states):
+        batch_size, sequence_length, hidden_size = hidden_states.shape
+
+        # project to one logit per codevector in every group
+        hidden_states = self.weight_proj(hidden_states)
+        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)  # one row of logits per (frame, group)
+
+        if self.training:
+            # sample code vector probs via gumbel in a differentiable way
+            codevector_probs = nn.functional.gumbel_softmax(
+                hidden_states.float(), tau=self.temperature, hard=True
+            ).type_as(hidden_states)
+
+            # compute perplexity
+            codevector_soft_dist = torch.softmax(
+                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
+            )
+            perplexity = self._compute_perplexity(codevector_soft_dist)
+        else:
+            # take argmax in non-differentiable way
+            # compute hard codevector distribution (one hot)
+            codevector_idx = hidden_states.argmax(dim=-1)
+            codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
+                -1, codevector_idx.view(-1, 1), 1.0
+            )
+            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
+
+            perplexity = self._compute_perplexity(codevector_probs)
+
+        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
+        # use probs to retrieve codevectors
+        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors  # broadcasts against the (1, groups * vars, dim) codebook
+        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)  # sum over the vars axis, then flatten the groups into the last axis
+
+        return codevectors, perplexity
+
+
+@auto_docstring
+class UniSpeechPreTrainedModel(PreTrainedModel):
+    config: UniSpeechConfig
+    base_model_prefix = "unispeech"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        # gumbel softmax requires special init
+        if isinstance(module, UniSpeechGumbelVectorQuantizer):
+            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+            module.weight_proj.bias.data.zero_()
+            nn.init.uniform_(module.codevectors)
+        elif isinstance(module, UniSpeechPositionalConvEmbedding):
+            nn.init.normal_(
+                module.conv.weight,
+                mean=0,
+                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+            )
+            nn.init.constant_(module.conv.bias, 0)
+        elif isinstance(module, UniSpeechFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the convolutional layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations makes sure that all values before the output lengths idxs are attended to
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask
+
+
+def _compute_mask_indices(
+    shape: tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    attention_mask: Optional[torch.LongTensor] = None,
+    min_masks: int = 0,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
+    CPU as part of the preprocessing during training.
+
+    Args:
+        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
+               the first element is the batch size and the second element is the length of the axis to span.
+        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+                    independently generated mask spans of length `mask_length` is computed by
+                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+                    actual percentage will be smaller.
+        mask_length: size of the mask
+        min_masks: minimum number of masked spans
+        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+                        each batch dimension.
+
+    Returns:
+        A boolean `np.ndarray` of shape `(batch_size, sequence_length)` that is `True` at every masked position.
+
+    Raises:
+        ValueError: If `mask_length` is smaller than 1 or larger than `sequence_length`.
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    # note: `mask_length == sequence_length` passes this check
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+            f" and `sequence_length`: {sequence_length}`"
+        )
+
+    # epsilon is used for probabilistic rounding; it is drawn once and shared by every row of the batch
+    epsilon = np.random.rand(1).item()
+
+    def compute_num_masked_span(input_length):
+        """Given input length, compute how many spans should be masked"""
+        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+        num_masked_span = max(num_masked_span, min_masks)
+
+        # make sure num masked span <= sequence_length
+        if num_masked_span * mask_length > sequence_length:
+            num_masked_span = sequence_length // mask_length
+
+        # make sure num_masked span is also <= input_length - (mask_length - 1)
+        if input_length - (mask_length - 1) < num_masked_span:
+            num_masked_span = max(input_length - (mask_length - 1), 0)
+
+        return num_masked_span
+
+    # compute number of masked spans in batch
+    # (without an attention mask every row is treated as having the full `sequence_length`)
+    input_lengths = (
+        attention_mask.detach().sum(-1).tolist()
+        if attention_mask is not None
+        else [sequence_length for _ in range(batch_size)]
+    )
+
+    # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+    spec_aug_mask_idxs = []
+
+    # number of spans for a full-length row; every row's index vector is padded up to this size below
+    max_num_masked_span = compute_num_masked_span(sequence_length)
+
+    # nothing to mask: return the all-False mask
+    if max_num_masked_span == 0:
+        return spec_aug_mask
+
+    for input_length in input_lengths:
+        # compute num of masked spans for this input
+        num_masked_span = compute_num_masked_span(input_length)
+
+        # get random indices to mask
+        # (span start positions, sampled without replacement so that a whole span fits inside `input_length`)
+        spec_aug_mask_idx = np.random.choice(
+            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+        )
+
+        # pick first sampled index that will serve as a dummy index to pad vector
+        # to ensure same dimension for all batches due to probabilistic rounding
+        # Picking first sample just pads those vectors twice.
+        if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller than
+            # `sequence_length` in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+            dummy_mask_idx = sequence_length - 1
+        else:
+            dummy_mask_idx = spec_aug_mask_idx[0]
+
+        spec_aug_mask_idx = np.concatenate(
+            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+        )
+        spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+    # expand masked indices to masked spans
+    # (each start index is repeated `mask_length` times; the offsets added below turn the copies into a span)
+    spec_aug_mask_idxs = np.broadcast_to(
+        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
+
+    # add offset to the starting indexes so that indexes now create a span
+    offsets = np.arange(mask_length)[None, None, :]
+    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+        batch_size, max_num_masked_span * mask_length
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # ensure that we cannot have indices larger than sequence_length
+    # (overflowing indices are clamped onto the last position)
+    if spec_aug_mask_idxs.max() > sequence_length - 1:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+# UniSpeech reuses the Wav2Vec2 base model output class unchanged; this alias only gives it a UniSpeech name.
+UniSpeechBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+@auto_docstring
+class UniSpeechModel(UniSpeechPreTrainedModel):
+    def __init__(self, config: UniSpeechConfig):
+        super().__init__(config)
+        self.config = config
+        self.feature_extractor = UniSpeechFeatureEncoder(config)
+        self.feature_projection = UniSpeechFeatureProjection(config)
+
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        if config.do_stable_layer_norm:
+            self.encoder = UniSpeechEncoderStableLayerNorm(config)
+        else:
+            self.encoder = UniSpeechEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _mask_hidden_states(
+        self,
+        hidden_states: torch.FloatTensor,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://huggingface.co/papers/1904.08779).
+        """
+
+        # `config.apply_spec_augment` can set masking to False
+        if not getattr(self.config, "apply_spec_augment", True):
+            return hidden_states
+
+        # generate indices & apply SpecAugment along time axis
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        if mask_time_indices is not None:
+            # apply SpecAugment along time axis with given mask_time_indices
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+        elif self.config.mask_time_prob > 0 and self.training:
+            mask_time_indices = _compute_mask_indices(
+                (batch_size, sequence_length),
+                mask_prob=self.config.mask_time_prob,
+                mask_length=self.config.mask_time_length,
+                attention_mask=attention_mask,
+                min_masks=self.config.mask_time_min_masks,
+            )
+            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+
+        if self.config.mask_feature_prob > 0 and self.training:
+            # generate indices & apply SpecAugment along feature axis
+            mask_feature_indices = _compute_mask_indices(
+                (batch_size, hidden_size),
+                mask_prob=self.config.mask_feature_prob,
+                mask_length=self.config.mask_feature_length,
+                min_masks=self.config.mask_feature_min_masks,
+            )
+            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
+            hidden_states[mask_feature_indices] = 0
+
+        return hidden_states
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, UniSpeechBaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if not return_dict:
+            return (hidden_states, extract_features) + encoder_outputs[1:]
+
+        return UniSpeechBaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
+    """
+)
+class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
+    def __init__(self, config: UniSpeechConfig):
+        super().__init__(config)
+        self.unispeech = UniSpeechModel(config)
+        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
+
+        self.quantizer = UniSpeechGumbelVectorQuantizer(config)
+        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
+        self.project_hid = nn.Linear(config.proj_codevector_dim, config.hidden_size)
+
+        self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def set_gumbel_temperature(self, temperature: int):
+        """
+        Set the Gumbel softmax temperature to a given value. Only necessary for training
+        """
+        self.quantizer.temperature = temperature
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.unispeech.feature_extractor._freeze_parameters()
+
+    @staticmethod
+    def compute_contrastive_logits(
+        target_features: torch.FloatTensor,
+        negative_features: torch.FloatTensor,
+        predicted_features: torch.FloatTensor,
+        temperature: int = 1,
+    ):
+        """
+        Compute logits for contrastive loss based using cosine similarity as the distance measure between
+        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
+        """
+        target_features = torch.cat([target_features, negative_features], dim=0)
+
+        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1)
+        logits = logits.type_as(target_features)
+
+        # apply temperature
+        logits = logits / temperature
+        return logits
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, UniSpeechForPreTrainingOutput]:
+        r"""
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining
+
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
+        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
+        >>> # TODO: Add full pretraining example
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.unispeech(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        transformer_features = outputs[0]
+
+        # quantize all (unmasked) extracted features and project to final vq dim
+        extract_features = self.dropout_features(outputs[1])
+        quantized_features, codevector_perplexity = self.quantizer(extract_features)
+
+        # project quantized features twice
+        quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype))
+        quantized_features = self.project_hid(quantized_features)
+
+        prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_(
+            self.config.replace_prob
+        )
+        prob_replace_matrix = prob_replace_matrix.transpose(0, 1)
+        sampled_replace_matrix = torch.bernoulli(prob_replace_matrix).bool().to(transformer_features.device)
+        sampled_replace_matrix = sampled_replace_matrix.transpose(0, 1)
+        sampled_replace_matrix = sampled_replace_matrix.unsqueeze(-1)
+        logits = transformer_features.masked_fill(sampled_replace_matrix, 0.0) + (
+            quantized_features.masked_fill(~sampled_replace_matrix, 0.0)
+        )
+
+        # project to ctc units
+        logits = self.dropout(logits)
+        logits = self.ctc_proj(logits)
+
+        # TODO(PVP) - add negative sampling & loss computation
+        loss = None
+        if not return_dict:
+            if loss is not None:
+                return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+            return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+
+        return UniSpeechForPreTrainingOutput(
+            loss=loss,
+            projected_states=transformer_features,
+            projected_quantized_states=quantized_features,
+            codevector_perplexity=codevector_perplexity,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Index at which the optional extra outputs start in the tuple returned by `UniSpeechModel.forward`
+# (positions 0 and 1 hold the last hidden state and the extracted features).
+_HIDDEN_STATES_START_POSITION = 2
+
+
+@auto_docstring(
+    custom_intro="""
+    UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
+    """
+)
+class UniSpeechForCTC(UniSpeechPreTrainedModel):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        r"""
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by
+            default.
+        """
+        super().__init__(config)
+
+        self.unispeech = UniSpeechModel(config)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        self.target_lang = target_lang
+
+        if config.vocab_size is None:
+            raise ValueError(
+                f"You are trying to instantiate {self.__class__} with a configuration that "
+                "does not define the vocabulary size of the language model head. Please "
+                "instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+                "or define `vocab_size` of your model's configuration."
+            )
+        output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def tie_weights(self):
+        """
+        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
+        passing `target_lang=...` to `from_pretrained(...)`.
+
+        This method is **not** supposed to be called by the user and is prone to be changed in the future.
+        """
+
+        # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
+        # correctly load adapter layers for UniSpeech so that we do not have to introduce a new API to
+        # [`PreTrainedModel`]. While slightly hacky, UniSpeech never has to tie input and output embeddings, so that it is
+        # ok to repurpose this function here.
+        target_lang = self.target_lang
+
+        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
+            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
+        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
+            logger.info("By default `target_lang` is set to 'eng'.")
+        elif target_lang is not None:
+            self.load_adapter(target_lang, force_load=True)
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.unispeech.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.unispeech.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+        outputs = self.unispeech(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
+    SUPERB Keyword Spotting.
+    """
+)
+class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)"
+            )
+        self.unispeech = UniSpeechModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.unispeech.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.unispeech.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.unispeech(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+        if attention_mask is None:
+            pooled_output = hidden_states.mean(dim=1)
+        else:
+            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+            expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_padding_mask] = 0.0
+            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Names exported by this module through `from ... import *`.
+__all__ = [
+    "UniSpeechForCTC",
+    "UniSpeechForPreTraining",
+    "UniSpeechForSequenceClassification",
+    "UniSpeechModel",
+    "UniSpeechPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.py
new file mode 100644
index 0000000000000000000000000000000000000000..900079b7bb9b469f562bab3be8ea3605f03bd0f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.py
@@ -0,0 +1,445 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch UniSpeech model."""
+
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...modeling_outputs import ModelOutput, Wav2Vec2BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from ..wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2Encoder,
+    Wav2Vec2EncoderStableLayerNorm,
+    Wav2Vec2FeatureEncoder,
+    Wav2Vec2FeatureProjection,
+    Wav2Vec2ForCTC,
+    Wav2Vec2ForSequenceClassification,
+    Wav2Vec2GumbelVectorQuantizer,
+    Wav2Vec2Model,
+    Wav2Vec2PositionalConvEmbedding,
+)
+from .configuration_unispeech import UniSpeechConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`UniSpeechForPreTraining`], with potential hidden states and attentions.
+    """
+)
+class UniSpeechForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
+        paper](https://huggingface.co/papers/2006.11477).
+    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
+        projected quantized states.
+    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
+        target vectors for contrastive loss.
+    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
+        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    projected_states: Optional[torch.FloatTensor] = None
+    projected_quantized_states: Optional[torch.FloatTensor] = None
+    codevector_perplexity: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None  # forwarded from the base model's `hidden_states`
+    attentions: Optional[tuple[torch.FloatTensor]] = None  # forwarded from the base model's `attentions`
+
+
+class UniSpeechPositionalConvEmbedding(Wav2Vec2PositionalConvEmbedding):
+    pass
+
+
+class UniSpeechFeatureEncoder(Wav2Vec2FeatureEncoder):
+    pass
+
+
+class UniSpeechFeatureProjection(Wav2Vec2FeatureProjection):
+    pass
+
+
+class UniSpeechEncoder(Wav2Vec2Encoder):
+    pass
+
+
+class UniSpeechEncoderStableLayerNorm(Wav2Vec2EncoderStableLayerNorm):
+    pass
+
+
+class UniSpeechGumbelVectorQuantizer(Wav2Vec2GumbelVectorQuantizer):
+    @staticmethod
+    def _compute_perplexity(probs):
+        marginal_probs = probs.mean(dim=0)  # average the distributions over the leading (row) axis
+        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
+        return perplexity  # exp(entropy) along the last axis, summed over what remains
+
+    def forward(self, hidden_states):
+        batch_size, sequence_length, hidden_size = hidden_states.shape
+
+        # project with `weight_proj`, then flatten so that every row belongs to a single group
+        hidden_states = self.weight_proj(hidden_states)
+        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
+
+        if self.training:
+            # sample code vector probs via gumbel softmax in a differentiable way
+            codevector_probs = nn.functional.gumbel_softmax(
+                hidden_states.float(), tau=self.temperature, hard=True
+            ).type_as(hidden_states)
+
+            # compute perplexity from the soft (non-sampled) per-group distribution
+            codevector_soft_dist = torch.softmax(
+                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
+            )
+            perplexity = self._compute_perplexity(codevector_soft_dist)
+        else:
+            # take argmax in non-differentiable way
+            # compute hard codevector distribution (one hot)
+            codevector_idx = hidden_states.argmax(dim=-1)
+            codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
+                -1, codevector_idx.view(-1, 1), 1.0
+            )
+            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
+
+            perplexity = self._compute_perplexity(codevector_probs)
+
+        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
+        # use probs to retrieve codevectors
+        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
+        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
+
+        return codevectors, perplexity
+
+
+@auto_docstring
+class UniSpeechPreTrainedModel(PreTrainedModel):
+    config: UniSpeechConfig
+    base_model_prefix = "unispeech"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        # gumbel softmax requires special init
+        if isinstance(module, UniSpeechGumbelVectorQuantizer):
+            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+            module.weight_proj.bias.data.zero_()
+            nn.init.uniform_(module.codevectors)
+        elif isinstance(module, UniSpeechPositionalConvEmbedding):
+            nn.init.normal_(
+                module.conv.weight,
+                mean=0,
+                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+            )
+            nn.init.constant_(module.conv.bias, 0)
+        elif isinstance(module, UniSpeechFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the convolutional layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
+        # Effectively attention_mask.sum(-1), but not in-place so that it can also run
+        # in inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # set a 1 at index (output_length - 1) of each row; the reversed cumsum then marks every position up to it
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask  # bool mask of shape (batch_size, feature_vector_length)
+
+
+UniSpeechBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+class UniSpeechModel(UniSpeechPreTrainedModel, Wav2Vec2Model):
+    def __init__(self, config: UniSpeechConfig):
+        UniSpeechPreTrainedModel.__init__(self, config)
+        self.config = config
+        self.feature_extractor = UniSpeechFeatureEncoder(config)
+        self.feature_projection = UniSpeechFeatureProjection(config)
+
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        if config.do_stable_layer_norm:
+            self.encoder = UniSpeechEncoderStableLayerNorm(config)
+        else:
+            self.encoder = UniSpeechEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for UniSpeech")
+
+    def freeze_feature_encoder(self):
+        raise AttributeError("Not needed for UniSpeech")
+
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, UniSpeechBaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if not return_dict:
+            return (hidden_states, extract_features) + encoder_outputs[1:]
+
+        return UniSpeechBaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
+    """
+)
+class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
+    def __init__(self, config: UniSpeechConfig):
+        super().__init__(config)
+        self.unispeech = UniSpeechModel(config)
+        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
+
+        self.quantizer = UniSpeechGumbelVectorQuantizer(config)
+        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
+        self.project_hid = nn.Linear(config.proj_codevector_dim, config.hidden_size)
+
+        self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def set_gumbel_temperature(self, temperature: int):
+        """
+        Set the Gumbel softmax temperature to a given value. Only necessary for training
+        """
+        self.quantizer.temperature = temperature
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        self.unispeech.feature_extractor._freeze_parameters()
+
+    @staticmethod
+    def compute_contrastive_logits(
+        target_features: torch.FloatTensor,
+        negative_features: torch.FloatTensor,
+        predicted_features: torch.FloatTensor,
+        temperature: int = 1,
+    ):
+        """
+        Compute logits for contrastive loss using cosine similarity as the distance measure between
+        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
+        """
+        target_features = torch.cat([target_features, negative_features], dim=0)
+
+        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1)
+        logits = logits.type_as(target_features)  # cast back to the targets' dtype after the `.float()` similarity
+
+        # apply temperature (the logits are divided by it)
+        logits = logits / temperature
+        return logits
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, UniSpeechForPreTrainingOutput]:
+        r"""
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining
+
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
+        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
+        >>> # TODO: Add full pretraining example
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.unispeech(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        transformer_features = outputs[0]
+
+        # quantize all (unmasked) extracted features and project to final vq dim
+        extract_features = self.dropout_features(outputs[1])
+        quantized_features, codevector_perplexity = self.quantizer(extract_features)
+
+        # project quantized features twice (`project_q`, then `project_hid`)
+        quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype))
+        quantized_features = self.project_hid(quantized_features)
+
+        prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_(
+            self.config.replace_prob
+        )  # NOTE: allocated without a device argument; the sampled mask is moved to the features' device below
+        prob_replace_matrix = prob_replace_matrix.transpose(0, 1)
+        sampled_replace_matrix = torch.bernoulli(prob_replace_matrix).bool().to(transformer_features.device)
+        sampled_replace_matrix = sampled_replace_matrix.transpose(0, 1)  # undo the transpose applied before sampling
+        sampled_replace_matrix = sampled_replace_matrix.unsqueeze(-1)  # trailing axis so the mask broadcasts
+        logits = transformer_features.masked_fill(sampled_replace_matrix, 0.0) + (
+            quantized_features.masked_fill(~sampled_replace_matrix, 0.0)
+        )  # positions where the mask is True take the quantized features, the rest keep the transformer ones
+
+        # project to ctc units (NOTE: these logits are not part of the outputs returned below)
+        logits = self.dropout(logits)
+        logits = self.ctc_proj(logits)
+
+        # TODO(PVP) - add negative sampling & loss computation
+        loss = None
+        if not return_dict:
+            if loss is not None:
+                return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+            return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+
+        return UniSpeechForPreTrainingOutput(
+            loss=loss,
+            projected_states=transformer_features,
+            projected_quantized_states=quantized_features,
+            codevector_perplexity=codevector_perplexity,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class UniSpeechForCTC(Wav2Vec2ForCTC):
+    pass
+
+
+class UniSpeechForSequenceClassification(Wav2Vec2ForSequenceClassification):
+    pass
+
+
+__all__ = [
+    "UniSpeechForCTC",
+    "UniSpeechForPreTraining",
+    "UniSpeechForSequenceClassification",
+    "UniSpeechModel",
+    "UniSpeechPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..614f80d221131a42e162d79490b5b1efa46680d9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/configuration_unispeech_sat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/configuration_unispeech_sat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c79e7e2c348a2a3405630905cc6ba0fb9334e4c3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/configuration_unispeech_sat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bcfb3da23fc7b5986870a0ff0581adfec5bf0ff
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modeling_unispeech_sat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modular_unispeech_sat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modular_unispeech_sat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2d5fa35fac6543e1ba8099dbcb93cf92aa96e09
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech_sat/__pycache__/modular_unispeech_sat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..418e017d34f3fd75603f6294e26703ad239f50af
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/configuration_univnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/configuration_univnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ced8cc30ac718592a38863c5c78a1f487d6dc713
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/configuration_univnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/feature_extraction_univnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/feature_extraction_univnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b1fd42c94ee1c8f7a7967eae2d5acaa584c29ac
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/feature_extraction_univnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/modeling_univnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/modeling_univnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c79f48b6009613c9a8d733cbe8424901dc58d80
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/univnet/__pycache__/modeling_univnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..613aae114b339649af52e6000dfa52ee18438f5f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_vision_encoder_decoder import *
+    from .modeling_flax_vision_encoder_decoder import *
+    from .modeling_tf_vision_encoder_decoder import *
+    from .modeling_vision_encoder_decoder import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..248bf73ff9faf4ec54388c7fd947a1e3a3333b13
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
@@ -0,0 +1,215 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+from ..auto.configuration_auto import AutoConfig
+
+
+if TYPE_CHECKING:
+    from ... import PreTrainedTokenizerBase, TensorType
+
+logger = logging.get_logger(__name__)
+
+
+class VisionEncoderDecoderConfig(PretrainedConfig):
+    r"""
+    [`VisionEncoderDecoderConfig`] is the configuration class to store the configuration of a
+    [`VisionEncoderDecoderModel`]. It is used to instantiate a Vision-Encoder-Text-Decoder model according to the
+    specified arguments, defining the encoder and decoder configs.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        kwargs (*optional*):
+            Dictionary of keyword arguments. Notably:
+
+                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+                  the encoder config.
+                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
+                  the decoder config.
+
+    Examples:
+
+    ```python
+    >>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel
+
+    >>> # Initializing a ViT & BERT style configuration
+    >>> config_encoder = ViTConfig()
+    >>> config_decoder = BertConfig()
+
+    >>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+
+    >>> # Initializing a ViTBert model (with random weights) from a ViT & google-bert/bert-base-uncased style configurations
+    >>> model = VisionEncoderDecoderModel(config=config)
+
+    >>> # Accessing the model configuration
+    >>> config_encoder = model.config.encoder
+    >>> config_decoder = model.config.decoder
+    >>> # set decoder config to causal lm
+    >>> config_decoder.is_decoder = True
+    >>> config_decoder.add_cross_attention = True
+
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained("my-model")
+
+    >>> # loading model and config from pretrained folder
+    >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("my-model")
+    >>> model = VisionEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
+    ```"""
+
+    model_type = "vision-encoder-decoder"
+    sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig}
+    has_no_defaults_at_init = True
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if "encoder" not in kwargs or "decoder" not in kwargs:
+            raise ValueError(
+                f"A configuration of type {self.model_type} cannot be instantiated because "
+                f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
+            )
+
+        encoder_config = dict(kwargs.pop("encoder"))  # shallow copy: the `.pop` below must not strip the caller's dict
+        encoder_model_type = encoder_config.pop("model_type")
+        decoder_config = dict(kwargs.pop("decoder"))  # shallow copy, same reason as for the encoder
+        decoder_model_type = decoder_config.pop("model_type")
+
+        self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
+        self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
+        self.is_encoder_decoder = True
+
+    @classmethod
+    def from_encoder_decoder_configs(
+        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
+    ) -> PretrainedConfig:
+        r"""
+        Instantiate a [`VisionEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
+        configuration and decoder model configuration.
+
+        Returns:
+            [`VisionEncoderDecoderConfig`]: An instance of a configuration object
+        """
+        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
+        decoder_config.is_decoder = True
+        decoder_config.add_cross_attention = True
+
+        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
+
+
+class VisionEncoderDecoderEncoderOnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict({"last_hidden_state": {0: "batch", 1: "encoder_sequence"}})
+
+
+class VisionEncoderDecoderDecoderOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict()
+        common_inputs["input_ids"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
+        common_inputs["attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
+        common_inputs["encoder_hidden_states"] = {0: "batch", 1: "encoder_sequence"}
+
+        return common_inputs
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: "PreTrainedTokenizerBase",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional["TensorType"] = None,
+    ) -> Mapping[str, Any]:
+        import torch  # function-scope import: torch is only needed when dummy inputs are built
+
+        common_inputs = OrderedDict()
+
+        dummy_input = super().generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+
+        batch, encoder_sequence = dummy_input["input_ids"].shape  # encoder length reuses the input_ids length
+        encoder_hidden_states_shape = (batch, encoder_sequence, self._config.encoder_hidden_size)
+        common_inputs["input_ids"] = dummy_input.pop("input_ids")
+        common_inputs["attention_mask"] = dummy_input.pop("attention_mask")
+        common_inputs["encoder_hidden_states"] = torch.zeros(encoder_hidden_states_shape)  # all-zero placeholder
+
+        return common_inputs
+
+
+class VisionEncoderDecoderOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> None:
+        pass
+
+    def get_encoder_config(self, encoder_config: PretrainedConfig) -> OnnxConfig:
+        r"""
+        Returns ONNX encoder config for `VisionEncoderDecoder` model.
+
+        Args:
+            encoder_config (`PretrainedConfig`):
+                The encoder model's configuration to use when exporting to ONNX.
+
+        Returns:
+            [`VisionEncoderDecoderEncoderOnnxConfig`]: An instance of the ONNX configuration object
+        """
+        return VisionEncoderDecoderEncoderOnnxConfig(encoder_config)
+
+    def get_decoder_config(
+        self, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, feature: str = "default"
+    ) -> OnnxConfig:
+        r"""
+        Returns ONNX decoder config for `VisionEncoderDecoder` model.
+
+        Args:
+            encoder_config (`PretrainedConfig`):
+                The encoder model's configuration to use when exporting to ONNX.
+            decoder_config (`PretrainedConfig`):
+                The decoder model's configuration to use when exporting to ONNX
+            feature (`str`, *optional*):
+                The type of feature to export the model with.
+
+        Returns:
+            [`VisionEncoderDecoderDecoderOnnxConfig`]: An instance of the ONNX configuration object.
+        """
+        decoder_config.encoder_hidden_size = encoder_config.hidden_size
+        return VisionEncoderDecoderDecoderOnnxConfig(decoder_config, feature)
+
+
+__all__ = ["VisionEncoderDecoderConfig", "VisionEncoderDecoderOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a59c799cc04afa51c0f2156baa06a88683f1dbf6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -0,0 +1,864 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support Vision-Encoder-Text-Decoder architectures"""
+
+import os
+from typing import Optional, Union
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from jax.random import PRNGKey
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput
+from ...modeling_flax_utils import FlaxPreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM
+from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger
+
+_CONFIG_FOR_DOC = "VisionEncoderDecoderConfig"  # config class name handed to `replace_return_docstrings`
+
+VISION_ENCODER_DECODER_START_DOCSTRING = r"""
+    This class can be used to initialize an image-to-text-sequence model with any pretrained vision autoencoding model
+    as the encoder and any pretrained text autoregressive model as the decoder. The encoder is loaded via
+    [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
+    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
+    generative task, like image captioning.
+
+    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
+    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
+    Tasks](https://huggingface.co/papers/1907.12461) by Sascha Rothe, Shashi Narayan and Aliaksei
+    Severyn.
+
+    Additionally, in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained
+    Models](https://huggingface.co/papers/2109.10282) it is shown how leveraging large pretrained vision models for optical
+    character recognition (OCR) yields a significant performance improvement.
+
+    After such a Vision-Encoder-Text-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any
+    other models (see the examples for more information).
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.).
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.
+
+    Parameters:
+        config ([`VisionEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using
+            [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] for details.
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_position_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.decoder.max_position_embeddings - 1]`.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
+"""
+
+VISION_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using the vision model's image processor. For example, using
+            [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple.
+"""
+
+VISION_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r"""
+    Args:
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            For sequence to sequence training, `decoder_input_ids` should be provided. If no `decoder_input_ids` is
+            provided, the model will create this tensor by shifting the `input_ids` to the right for denoising
+            pre-training.
+        encoder_outputs (`tuple(tuple(jnp.ndarray))`):
+            Tuple consisting of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
+            `last_hidden_state`, of shape `(batch_size, sequence_length, hidden_size)`, is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_position_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.decoder.max_position_embeddings - 1]`.
+        past_key_values (`dict[str, jnp.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a
+            plain tuple.
+"""
+
+
+class FlaxVisionEncoderDecoderModule(nn.Module):
+    config: VisionEncoderDecoderConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        encoder_config = self.config.encoder
+        decoder_config = self.config.decoder
+
+        # Copied from `modeling_hybrid_clip.py` with modifications.
+        from ...models.auto.modeling_flax_auto import FLAX_MODEL_FOR_CAUSAL_LM_MAPPING, FLAX_MODEL_MAPPING
+
+        encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class
+        decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class
+
+        self.encoder = encoder_module(encoder_config, dtype=self.dtype)
+        self.decoder = decoder_module(decoder_config, dtype=self.dtype)
+
+        # encoder outputs might need to be projected to different dimension for decoder
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = nn.Dense(
+                self.decoder.config.hidden_size,
+                kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
+                dtype=self.dtype,
+            )
+        else:
+            self.enc_to_dec_proj = None
+
+    def _get_encoder_module(self):
+        return self.encoder
+
+    def _get_projection_module(self):
+        return self.enc_to_dec_proj
+
+    def _get_decoder_module(self):
+        return self.decoder
+
+    def __call__(
+        self,
+        pixel_values,
+        decoder_input_ids,
+        decoder_attention_mask,
+        decoder_position_ids,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        encoder_outputs = self.encoder(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+
+        encoder_hidden_states = encoder_outputs[0]
+
+        # optionally project encoder_hidden_states
+        if self.enc_to_dec_proj is not None:
+            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+        # The advantage of explicitly setting this is TPU XLA compiler knows as soon as possible what shape this
+        # variable has and can better optimize. Also passing `None` can lead to some problems when jitting the model.
+        # In Flax/JAX, we only want to pass `None` for non-tensor function inputs. For all tensor function inputs, we
+        # should always pass a tensor and not `None`.
+        batch_size, sequence_length = encoder_hidden_states.shape[:2]
+        encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return FlaxSeq2SeqLMOutput(
+            logits=decoder_outputs.logits,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(VISION_ENCODER_DECODER_START_DOCSTRING)
+class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
+    r"""
+    [`FlaxVisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture
+    with the module (flax.nn.Module) of one of the base vision model classes of the library as encoder module and
+    another one as decoder module when created with the [`~FlaxAutoModel.from_pretrained`] class method
+    for the encoder and [`~FlaxAutoModelForCausalLM.from_pretrained`] class method for the decoder.
+    """
+
+    config_class = VisionEncoderDecoderConfig
+    base_model_prefix = "vision_encoder_decoder"
+    main_input_name = "pixel_values"
+    module_class = FlaxVisionEncoderDecoderModule
+
+    def __init__(
+        self,
+        config: VisionEncoderDecoderConfig,
+        input_shape: Optional[tuple] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        if not _do_init:
+            raise ValueError(
+                "`FlaxVisionEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
+            )
+
+        if input_shape is None:
+            num_channels = getattr(config.encoder, "num_channels", 3)
+            input_shape = (
+                (1, config.encoder.image_size, config.encoder.image_size, num_channels),
+                (1, 1),
+            )
+
+        if config.decoder.cross_attention_hidden_size is not None:
+            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+                    " `config.encoder.hidden_size`."
+                )
+
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        encoder_input_shape, decoder_input_shape = input_shape
+
+        # init input tensors
+        pixel_values = jnp.zeros(encoder_input_shape, dtype=self.dtype)
+        decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+
+        batch_size, _, _, _ = pixel_values.shape
+        decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape
+        if not decoder_batch_size == batch_size:
+            raise ValueError(
+                f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder "
+                f"and {decoder_batch_size} for decoder."
+            )
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
+        )
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(
+            rngs,
+            pixel_values,
+            decoder_input_ids,
+            decoder_attention_mask,
+            decoder_position_ids,
+        )["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length, encoder_outputs):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
+                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
+                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+                cross-attention of the decoder.
+        """
+        # init input variables to retrieve cache
+        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
+        )
+
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                input_ids=decoder_input_ids,
+                attention_mask=decoder_attention_mask,
+                position_ids=decoder_position_ids,
+                **kwargs,
+            )
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0),
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],
+            init_cache=True,
+            method=_decoder_forward,  # we only need to call the decoder to init the cache
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings(VISION_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def encode(
+        self,
+        pixel_values: jnp.ndarray,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, FlaxVisionEncoderDecoderModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+
+        >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
+        ... )
+
+        >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
+        >>> encoder_outputs = model.encode(pixel_values)
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # `FlaxViTModel` expects channel first format, but `FlaxViTModule` expects channel last format.
+        # Currently, we assume this holds for all Flax vision models, and perform a transpose here.
+        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        def _encoder_forward(module, pixel_values, **kwargs):
+            encode_module = module._get_encoder_module()
+            return encode_module(pixel_values, **kwargs)
+
+        outputs = self.module.apply(
+            {"params": params or self.params},
+            pixel_values=jnp.array(pixel_values, dtype=self.dtype),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            method=_encoder_forward,
+        )
+
+        if return_dict:
+            outputs = FlaxBaseModelOutput(
+                last_hidden_state=outputs.last_hidden_state,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        return outputs
+
+    @add_start_docstrings(VISION_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: Optional[dict] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, FlaxVisionEncoderDecoderModel
+        >>> import jax.numpy as jnp
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+
+        >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
+        ... )
+
+        >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
+        >>> encoder_outputs = model.encode(pixel_values)
+
+        >>> decoder_start_token_id = model.config.decoder.bos_token_id
+        >>> decoder_input_ids = jnp.ones((pixel_values.shape[0], 1), dtype="i4") * decoder_start_token_id
+
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        encoder_hidden_states = encoder_outputs[0]
+
+        batch_size, sequence_length = encoder_hidden_states.shape[:2]
+        encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxBartAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        def _decoder_forward(
+            module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs
+        ):
+            projection_module = module._get_projection_module()
+            decoder_module = module._get_decoder_module()
+
+            # optionally project encoder_hidden_states
+            if projection_module is not None:
+                encoder_hidden_states = projection_module(encoder_hidden_states)
+
+            return decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                encoder_hidden_states,
+                **kwargs,
+            )
+
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past = outputs
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past = outputs
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+
+        return outputs
+
+    @add_start_docstrings_to_model_forward(VISION_ENCODER_DECODER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def __call__(
+        self,
+        pixel_values: jnp.ndarray,
+        decoder_input_ids: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import FlaxVisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+
+        >>> # load output tokenizer
+        >>> tokenizer_output = AutoTokenizer.from_pretrained("openai-community/gpt2")
+
+        >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
+        ... )
+
+        >>> pixel_values = image_processor(images=image, return_tensors="np").pixel_values
+
+        >>> # use GPT2's eos_token as the pad as well as eos token
+        >>> model.config.eos_token_id = model.config.decoder.eos_token_id
+        >>> model.config.pad_token_id = model.config.eos_token_id
+
+        >>> # generation
+        >>> sequences = model.generate(pixel_values, num_beams=4, max_length=12).sequences
+
+        >>> captions = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # prepare encoder inputs
+
+        # `FlaxViTModel` expects channel first format, but `FlaxViTModule` expects channel last format.
+        # Currently, we assume this holds for all Flax vision models, and perform a transpose here.
+        pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1))
+
+        # prepare decoder inputs
+        if decoder_input_ids is None:
+            raise ValueError("`decoder_input_ids` can't be `None`.")
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        if decoder_position_ids is None:
+            batch_size, sequence_length = decoder_input_ids.shape
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+
+        # Handle any PRNG if needed
+        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+        return self.module.apply(
+            {"params": params or self.params},
+            pixel_values=jnp.array(pixel_values, dtype=self.dtype),
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+        )
+
+    def prepare_inputs_for_generation(  # builds the decoder kwargs (cache, attention mask, position ids) for generation
+        self,
+        decoder_input_ids,
+        max_length,
+        decoder_attention_mask: Optional[jax.Array] = None,
+        encoder_outputs=None,
+        **kwargs,  # accepted but never read in this method
+    ):
+        # initializing the cache
+        batch_size, seq_length = decoder_input_ids.shape  # the unpacking requires a rank-2 `decoder_input_ids`
+
+        past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)  # cache is sized for `max_length`
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if decoder_attention_mask is not None:
+            decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1  # running count of mask entries minus one; for a 0/1 mask, leading 0 entries yield -1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))  # copy the given mask into the all-ones mask starting at index (0, 0)
+        else:
+            decoder_position_ids = jnp.broadcast_to(  # no mask given: positions are 0..seq_length-1 for every row
+                jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
+            )
+
+        return {  # `decoder_attention_mask` holds the mask extended to `max_length`, not the one passed in
+            "past_key_values": past_key_values,
+            "encoder_outputs": encoder_outputs,
+            "decoder_attention_mask": extended_attention_mask,
+            "decoder_position_ids": decoder_position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):  # refreshes cache and position ids; mutates `model_kwargs` in place and returns it
+        model_kwargs["past_key_values"] = model_outputs.past_key_values  # carry the cache from the latest outputs forward
+        model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1  # keep only the last position per row, advanced by one
+        return model_kwargs
+
+    @classmethod
+    def from_encoder_decoder_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
+        *model_args,
+        **kwargs,
+    ) -> FlaxPreTrainedModel:
+        r"""
+        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+        checkpoints.
+
+        Params:
+            encoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*):
+                Information necessary to initiate the encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. An
+                      example is `google/vit-base-patch16-224-in21k`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+            decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`):
+                Information necessary to initiate the decoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+
+            model_args (remaining positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+
+                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+        Example:
+
+        ```python
+        >>> from transformers import FlaxVisionEncoderDecoderModel
+
+        >>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
+        ... )
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./vit-gpt2")
+        >>> # load fine-tuned model
+        >>> model = FlaxVisionEncoderDecoderModel.from_pretrained("./vit-gpt2")
+        ```"""
+
+        kwargs_encoder = {  # every `encoder_*` kwarg, keyed by its name with the prefix stripped
+            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+        }
+
+        kwargs_decoder = {  # every `decoder_*` kwarg, keyed by its name with the prefix stripped
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # remove encoder, decoder kwargs from kwargs (what remains is used for the parent config further down)
+        for key in kwargs_encoder:
+            del kwargs["encoder_" + key]
+        for key in kwargs_decoder:
+            del kwargs["decoder_" + key]
+
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs_encoder.pop("model", None)  # a ready-made encoder can be supplied as `encoder_model`
+        if encoder is None:
+            if encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_encoder:  # no `encoder_config` supplied: load one from the checkpoint
+                encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path)
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+                        "from a decoder model. Cross-attention and causal mask are disabled."
+                    )
+                    encoder_config.is_decoder = False  # force both decoder-only flags off for the encoder
+                    encoder_config.add_cross_attention = False
+
+                kwargs_encoder["config"] = encoder_config
+
+            encoder = FlaxAutoModel.from_pretrained(  # `*model_args` is forwarded to the encoder loader only
+                encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
+            )
+
+        decoder = kwargs_decoder.pop("model", None)  # a ready-made decoder can be supplied as `decoder_model`
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_decoder:  # no `decoder_config` supplied: load one from the checkpoint
+                decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path)
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+                    )
+                    decoder_config.is_decoder = True  # force both flags on when either one was False
+                    decoder_config.add_cross_attention = True
+
+                kwargs_decoder["config"] = decoder_config
+
+            # Can only trigger for a caller-supplied config: an auto-loaded one had both flags forced to True above.
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+                )
+
+            decoder = FlaxAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)  # NOTE(review): unlike the encoder call, `*model_args` is not passed here
+
+        # instantiate config with corresponding kwargs
+        dtype = kwargs.pop("dtype", jnp.float32)  # popped first so `dtype` is not forwarded to the config factory
+        config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+
+        # init model
+        model = cls(config, dtype=dtype)
+        model.params["encoder"] = encoder.params  # replace the new model's encoder/decoder params with the loaded ones
+        model.params["decoder"] = decoder.params
+
+        return model
+
+
+__all__ = ["FlaxVisionEncoderDecoderModel"]  # names exported by `from <module> import *`
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef2ea2109987beabf5f8dc7b758d055f0a543bbf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
@@ -0,0 +1,696 @@
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support TF Vision-Encoder-Text-Decoder architectures"""
+
+from __future__ import annotations
+
+import re
+import warnings
+
+import numpy as np
+import tensorflow as tf
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput
+from ...modeling_tf_utils import TFCausalLanguageModelingLoss, TFPreTrainedModel, get_initializer, keras, unpack_inputs
+from ...tf_utils import shape_list
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM
+from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger, named after this module
+
+_CONFIG_FOR_DOC = "VisionEncoderDecoderConfig"  # config class name as a string; presumably consumed by the docstring decorators — TODO confirm
+
+DEPRECATION_WARNING = (  # message text only; the place where it is emitted is not visible in this part of the file
+    "Version v4.17.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
+    " encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
+    " fine-tuning a model trained with versions anterior to 4.17.0. The decoder_input_ids are now created based on the"
+    " labels, no need to pass them yourself anymore."
+)
+
+VISION_ENCODER_DECODER_START_DOCSTRING = r"""
+    This class can be used to initialize an image-to-text-sequence model with any pretrained vision autoencoding model
+    as the encoder and any pretrained text autoregressive model as the decoder. The encoder is loaded via
+    [`~TFAutoModel.from_pretrained`] function and the decoder is loaded via [`~TFAutoModelForCausalLM.from_pretrained`]
+    function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
+    generative task, like image captioning.
+
+    The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
+    tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
+    Tasks](https://huggingface.co/papers/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
+    Zhou, Wei Li, Peter J. Liu.
+
+    Additionally, in [TrOCR: Transformer-based Optical Character Recognition with Pre-trained
+    Models](https://huggingface.co/papers/2109.10282) it is shown how leveraging large pretrained vision models for optical
+    character recognition (OCR) yields a significant performance improvement.
+
+    After such a Vision-Encoder-Text-Decoder model has been trained/fine-tuned, it can be saved/loaded just like any
+    other models (see the examples for more information).
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`VisionEncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VISION_ENCODER_DECODER_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]`, `dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using the vision's model's image processor. For example, using
+            [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`] for details.
+        decoder_input_ids (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            Provide for sequence to sequence training to the decoder. Indices can be obtained using
+            [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            details.
+        decoder_attention_mask (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        encoder_outputs (`tuple(tuple(tf.Tensor))`, *optional*):
+            This tuple must consist of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` (`tf.Tensor` of shape `({0}, hidden_size)`) is a tensor of hidden-states at the output
+            of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(tf.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `({0})`.
+        decoder_inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+            representation. This is useful if you want more control over how to convert `decoder_input_ids` indices
+            into associated vectors than the model's internal embedding lookup matrix.
+        labels (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
+            Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0,
+            ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            If set to `True`, the model will return a [`~utils.Seq2SeqLMOutput`] instead of a plain tuple.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+        kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments come in two flavors:
+
+            - Without a prefix which will be input as `**encoder_kwargs` for the encoder forward function.
+            - With a *decoder_* prefix which will be input as `**decoder_kwargs` for the decoder forward function.
+"""
+
+
+# Copied from transformers.models.encoder_decoder.modeling_tf_encoder_decoder.shift_tokens_right
+def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):  # shift ids one step right, prepending the start token
+    if pad_token_id is None:
+        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
+    pad_token_id = tf.cast(pad_token_id, input_ids.dtype)  # match the ids' dtype so it can be used in tf.fill/tf.where below
+
+    if decoder_start_token_id is None:
+        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
+    decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
+
+    start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)  # one start-token column, one row per row of `input_ids`
+    shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)  # drop the last token of each row and prepend the start token
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids = tf.where(
+        shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids
+    )
+
+    # Verify that the shifted ids contain no negative values once -100 has been replaced
+    assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
+
+    # Make sure the assertion op is called by wrapping the result in an identity no-op
+    with tf.control_dependencies([assert_gte0]):
+        shifted_input_ids = tf.identity(shifted_input_ids)
+
+    return shifted_input_ids
+
+
+@add_start_docstrings(VISION_ENCODER_DECODER_START_DOCSTRING)
+class TFVisionEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
+    r"""
+    [`TFVisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture
+    with one of the base vision model classes of the library as encoder and another one of the base model classes as
+    decoder when created with the [`~TFAutoModel.from_pretrained`] class method for the encoder and
+    [`~TFAutoModelForCausalLM.from_pretrained`] class method for the decoder.
+    """
+
+    config_class = VisionEncoderDecoderConfig
+    base_model_prefix = "vision_encoder_decoder"
+    load_weight_prefix = "tf_vision_encoder_decoder_model"
+    main_input_name = "pixel_values"
+
+    def __init__(  # builds the model from a shared config and/or ready-made encoder and decoder models
+        self,
+        config: PretrainedConfig | None = None,
+        encoder: TFPreTrainedModel | None = None,
+        decoder: TFPreTrainedModel | None = None,
+    ):
+        if config is None and (encoder is None or decoder is None):  # without a config, both sub-models are required
+            raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
+        if config is None:
+            config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)  # derive the shared config from the two sub-model configs
+        else:
+            if not isinstance(config, self.config_class):
+                raise ValueError(f"config: {config} has to be of type {self.config_class}")
+
+        if config.decoder.cross_attention_hidden_size is not None:  # an explicit cross-attention width must equal the encoder width
+            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+                    " `config.encoder.hidden_size`."
+                )
+
+        # initialize with config
+        super().__init__(config)
+
+        if encoder is None:  # no encoder passed: build one from the encoder sub-config
+            encoder = TFAutoModel.from_config(config.encoder, name="encoder")
+
+        if decoder is None:  # no decoder passed: build one from the decoder sub-config
+            decoder = TFAutoModelForCausalLM.from_config(config.decoder, name="decoder")
+
+        self.encoder = encoder
+        self.decoder = decoder
+
+        if self.encoder.config.to_dict() != self.config.encoder.to_dict():  # warn only; the shared config is assigned below either way
+            logger.warning(
+                f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
+                f" {self.config.encoder}"
+            )
+        if self.decoder.config.to_dict() != self.config.decoder.to_dict():  # same check for the decoder
+            logger.warning(
+                f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
+                f" {self.config.decoder}"
+            )
+
+        # make sure that the individual model's config refers to the shared config
+        # so that the updates to the config will be synced
+        self.encoder.config = self.config.encoder
+        self.decoder.config = self.config.decoder
+
+        # encoder outputs might need to be projected to different dimension for decoder
+        if (  # the projection layer exists only when the widths differ and no cross-attention width is configured
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = keras.layers.Dense(
+                units=self.decoder.config.hidden_size,
+                kernel_initializer=get_initializer(config.encoder.initializer_range),
+                name="enc_to_dec_proj",
+            )
+
+        if self.encoder.get_output_embeddings() is not None:  # an encoder exposing output embeddings is rejected
+            raise ValueError(
+                f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
+            )
+
+    @property
+    def input_signature(self):  # returns tf.TensorSpec entries for `pixel_values` and `decoder_input_ids`
+        vision_config = self.config.encoder
+        if hasattr(vision_config, "vision_config"):  # use the nested vision config when the encoder config carries one
+            vision_config = vision_config.vision_config
+        if hasattr(vision_config, "image_size"):
+            image_size = vision_config.image_size
+        else:
+            image_size = vision_config.input_size  # fallback attribute name when `image_size` is absent
+        return {
+            "pixel_values": tf.TensorSpec(
+                shape=(  # (batch, channels, height, width); NOTE(review): `image_size` is used for both spatial dims, so a scalar/square size is assumed — confirm
+                    None,
+                    vision_config.num_channels,
+                    image_size,
+                    image_size,
+                ),
+                dtype=tf.float32,
+            ),
+            "decoder_input_ids": tf.TensorSpec(shape=(None, None), dtype=tf.int32, name="decoder_input_ids"),
+        }
+
+    def get_encoder(self):  # accessor for the wrapped encoder model
+        return self.encoder
+
+    def get_input_embeddings(self):  # input embeddings are those of the encoder
+        return self.encoder.get_input_embeddings()
+
+    def get_output_embeddings(self):  # output embeddings are those of the decoder
+        return self.decoder.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):  # delegates to the decoder and returns whatever its setter returns
+        return self.decoder.set_output_embeddings(new_embeddings)
+
+    def tf_to_pt_weight_rename(self, tf_weight):  # maps one TF weight name to a 1-tuple holding the (possibly renamed) name
+        # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
+        # (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal.
+        # However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption
+        # here that the config model_type is the same as the name of the MainLayer. I don't know of anywhere that's
+        # not the case, and I wasn't sure how else to go from the config to the correct MainLayer name!
+
+        # This override is only needed in the case where we're crossloading weights from PT. However, since weights are
+        # often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file.
+        # Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it
+        # or not.
+        encoder_model_type = self.config.encoder.model_type
+        if "encoder" in tf_weight and "decoder" not in tf_weight:  # plain substring tests; names that also contain "decoder" are left untouched
+            return (re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight),)  # NOTE(review): `model_type` goes into the pattern unescaped — fine for plain names; confirm none contain regex metacharacters
+        else:
+            return (tf_weight,)
+
+    @classmethod
+    def from_encoder_decoder_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path: str | None = None,
+        decoder_pretrained_model_name_or_path: str | None = None,
+        *model_args,
+        **kwargs,
+    ) -> TFPreTrainedModel:
+        r"""
+        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+        checkpoints.
+
+
+        Params:
+            encoder_pretrained_model_name_or_path (`str`, *optional*):
+                Information necessary to initiate the encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. An
+                      example is `google/vit-base-patch16-224-in21k`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case,
+                      `encoder_from_pt` should be set to `True`.
+
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to *None*):
+                Information necessary to initiate the decoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *pytorch checkpoint file* (e.g, `./pt_model/`). In this case,
+                      `decoder_from_pt` should be set to `True`.
+
+            model_args (remaining positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+
+                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+        Example:
+
+        ```python
+        >>> from transformers import TFVisionEncoderDecoderModel
+
+        >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
+        >>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
+        ... )
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./vit-bert")
+        >>> # load fine-tuned model
+        >>> model = TFVisionEncoderDecoderModel.from_pretrained("./vit-bert")
+        ```"""
+
+        kwargs_encoder = {
+            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # remove encoder, decoder kwargs from kwargs
+        for key in kwargs_encoder:
+            del kwargs["encoder_" + key]
+        for key in kwargs_decoder:
+            del kwargs["decoder_" + key]
+
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs_encoder.pop("model", None)
+        if encoder is None:
+            if encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_encoder:
+                encoder_config = AutoConfig.from_pretrained(encoder_pretrained_model_name_or_path)
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+                        "from a decoder model. Cross-attention and causal mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+
+                kwargs_encoder["config"] = encoder_config
+
+            kwargs_encoder["name"] = "encoder"
+            kwargs_encoder["load_weight_prefix"] = cls.load_weight_prefix
+            encoder = TFAutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
+
+        decoder = kwargs_decoder.pop("model", None)
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_decoder:
+                decoder_config = AutoConfig.from_pretrained(decoder_pretrained_model_name_or_path)
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+                    )
+                    decoder_config.is_decoder = True
+                    decoder_config.add_cross_attention = True
+
+                kwargs_decoder["config"] = decoder_config
+
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+                )
+
+            kwargs_decoder["name"] = "decoder"
+            kwargs_decoder["load_weight_prefix"] = cls.load_weight_prefix
+            decoder = TFAutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+        # Make sure these 2 `keras.Model` have fixed names so `from_pretrained` could load model weights correctly.
+        if encoder.name != "encoder":
+            raise ValueError("encoder model must be created with the name `encoder`.")
+        if decoder.name != "decoder":
+            raise ValueError("decoder model must be created with the name `decoder`.")
+
+        # instantiate config with corresponding kwargs
+        config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+        return cls(encoder=encoder, decoder=decoder, config=config)
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(
+        VISION_ENCODER_DECODER_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+    )
+    @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: np.ndarray | tf.Tensor | None = None,
+        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
+        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        encoder_outputs: tuple | TFBaseModelOutput | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+        **kwargs,
+    ) -> TFSeq2SeqLMOutput | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoTokenizer, TFVisionEncoderDecoderModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
+        >>> decoder_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+
+        >>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized
+        >>> model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "openai-community/gpt2"
+        ... )
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> img = Image.open(requests.get(url, stream=True).raw)
+
+        >>> # forward
+        >>> pixel_values = image_processor(images=img, return_tensors="tf").pixel_values  # Batch size 1
+        >>> decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids  # Batch size 1
+        >>> outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids)
+
+        >>> # training
+        >>> outputs = model(pixel_values=pixel_values, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids)
+        >>> loss, logits = outputs.loss, outputs.logits
+
+        >>> # save and load from pretrained
+        >>> model.save_pretrained("vit-gpt2")
+        >>> model = TFVisionEncoderDecoderModel.from_pretrained("vit-gpt2")
+
+        >>> # generation
+        >>> generated = model.generate(pixel_values, decoder_start_token_id=model.config.decoder.bos_token_id)
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # Let the user be responsible for the expected format.
+        if encoder_outputs is not None:
+            if return_dict and not isinstance(encoder_outputs, ModelOutput):
+                raise ValueError(
+                    "If `return_dict=True` and `encoder_outputs` is provided, it should be an instance of "
+                    f"`ModelOutput`. Got an instance {type(encoder_outputs)} for `encoder_outputs`."
+                )
+
+        if encoder_outputs is None:
+            encoder_inputs = {
+                "input_ids": pixel_values,
+                "output_attentions": output_attentions,
+                "output_hidden_states": output_hidden_states,
+                "return_dict": return_dict,
+                "training": training,
+            }
+
+            # Add arguments to encoder from `kwargs_encoder`
+            encoder_inputs.update(kwargs_encoder)
+
+            if "input_ids" in encoder_inputs:
+                encoder_inputs["pixel_values"] = encoder_inputs.pop("input_ids")
+
+            if encoder_inputs["pixel_values"] is None:
+                raise ValueError("You have to specify pixel_values")
+
+            # Handle the case where the inputs are passed as a single dict which contains `labels`.
+            # The `labels` shouldn't be passed to `self.encoder` below, because it is a base model without this
+            # parameter (otherwise, an error occurs when `input_processing` is called inside `self.encoder.call()`).
+            if "labels" in encoder_inputs:
+                labels = encoder_inputs.pop("labels")
+
+            # handle the init case where `dummy_inputs` returns a dict containing `decoder_input_ids`.
+            if "decoder_input_ids" in encoder_inputs:
+                decoder_input_ids = encoder_inputs.pop("decoder_input_ids")
+            # handle the init case where `dummy_inputs` returns a dict containing `decoder_attention_mask`.
+            if "decoder_attention_mask" in encoder_inputs:
+                decoder_attention_mask = encoder_inputs.pop("decoder_attention_mask")
+
+            encoder_outputs = self.encoder(**encoder_inputs)
+
+        encoder_hidden_states = encoder_outputs[0]
+
+        # optionally project encoder_hidden_states
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
+            decoder_input_ids = shift_tokens_right(
+                labels, self.config.pad_token_id, self.config.decoder_start_token_id
+            )
+
+        batch_size, sequence_length = shape_list(encoder_hidden_states)[:2]
+        encoder_attention_mask = tf.ones(shape=(batch_size, sequence_length), dtype=tf.int32)
+
+        decoder_inputs = {
+            "input_ids": decoder_input_ids,
+            "attention_mask": decoder_attention_mask,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_attention_mask": encoder_attention_mask,
+            "inputs_embeds": decoder_inputs_embeds,
+            "output_attentions": output_attentions,
+            "output_hidden_states": output_hidden_states,
+            "use_cache": use_cache,
+            "past_key_values": past_key_values,
+            "return_dict": return_dict,
+            "training": training,
+        }
+
+        # Add arguments to decoder from `kwargs_decoder`
+        decoder_inputs.update(kwargs_decoder)
+
+        decoder_outputs = self.decoder(**decoder_inputs)
+
+        logits = decoder_outputs[0]
+
+        # Compute loss independent from decoder (as some shift the logits inside them)
+        loss = None
+        if labels is not None:
+            warnings.warn(DEPRECATION_WARNING, FutureWarning)
+            loss = self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            past_key_values = None
+            if use_cache:
+                past_key_values = decoder_outputs[1]
+            # The starting index of the remaining elements in `decoder_outputs`
+            start_index = sum([1 if x is not None else 0 for x in (loss, logits, past_key_values)])
+
+            if not isinstance(encoder_outputs, tuple):
+                encoder_outputs = encoder_outputs.to_tuple()
+            output = (loss, logits, past_key_values) + decoder_outputs[start_index:] + encoder_outputs
+            output = tuple(x for x in output if x is not None)
+            return output
+
+        return TFSeq2SeqLMOutput(
+            loss=loss,
+            logits=decoder_outputs.logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def serving_output(self, output):
+        pkv = tf.tuple(output.past_key_values)[1] if self.config.decoder.use_cache else None
+        dec_hs = (
+            tf.convert_to_tensor(output.decoder_hidden_states) if self.config.decoder.output_hidden_states else None
+        )
+        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.decoder.output_attentions else None
+        enc_hs = (
+            tf.convert_to_tensor(output.encoder_hidden_states) if self.config.encoder.output_hidden_states else None
+        )
+        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.encoder.output_attentions else None
+        cross_attns = (
+            tf.convert_to_tensor(output.cross_attentions)
+            if self.config.decoder.output_attentions and output.cross_attentions is not None
+            else None
+        )
+
+        return TFSeq2SeqLMOutput(
+            logits=output.logits,
+            past_key_values=pkv,
+            decoder_hidden_states=dec_hs,
+            decoder_attentions=dec_attns,
+            encoder_last_hidden_state=output.encoder_last_hidden_state,
+            encoder_hidden_states=enc_hs,
+            encoder_attentions=enc_attns,
+            cross_attentions=cross_attns,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
+    ):
+        decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
+        decoder_attention_mask = decoder_inputs.get("attention_mask", None)
+        past_key_values = decoder_inputs.get("past_key_values")
+        input_dict = {
+            "pixel_values": None,  # needs to be passed to make Keras.layer.__call__ happy
+            "attention_mask": attention_mask,
+            "decoder_attention_mask": decoder_attention_mask,
+            "decoder_input_ids": decoder_inputs["input_ids"],
+            # TODO (joao): the `TFBaseModelOutput` wrapper should not be needed after the generate refactor is complete
+            "encoder_outputs": TFBaseModelOutput(last_hidden_state=encoder_outputs[0]),
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+        }
+        return input_dict
+
+    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
+        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+    def resize_token_embeddings(self, *args, **kwargs):
+        raise NotImplementedError(
+            "Resizing the embedding layers via the TFVisionEncoderDecoderModel directly is not supported. "
+            "Please use the respective methods of the wrapped objects (model.decoder.resize_token_embeddings(...))"
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "enc_to_dec_proj", None) is not None:
+            with tf.name_scope(self.enc_to_dec_proj.name):
+                self.enc_to_dec_proj.build([None, None, self.encoder.config.hidden_size])
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "decoder", None) is not None:
+            with tf.name_scope(self.decoder.name):
+                self.decoder.build(None)
+
+
+__all__ = ["TFVisionEncoderDecoderModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6bc2dcc0f8e94e0c79c01e9f47ab2e34aedb3cd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -0,0 +1,602 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Classes to support Vision-Encoder-Text-Decoder architectures"""
+
+import gc
+import os
+import tempfile
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache
+from ...configuration_utils import PretrainedConfig
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from ..auto.configuration_auto import AutoConfig
+from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from .configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
+
+
+# Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    if decoder_start_token_id is None:
+        raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class VisionEncoderDecoderModel(PreTrainedModel, GenerationMixin):
+    r"""
+    [`VisionEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
+    one of the base vision model classes of the library as encoder and another one as decoder when created with the
+    [`~AutoModel.from_pretrained`] class method for the encoder and
+    [`~AutoModelForCausalLM.from_pretrained`] class method for the decoder.
+    """
+
+    config: VisionEncoderDecoderConfig
+    base_model_prefix = "vision_encoder_decoder"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _supports_param_buffer_assignment = False
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    def __init__(
+        self,
+        config: Optional[PretrainedConfig] = None,
+        encoder: Optional[PreTrainedModel] = None,
+        decoder: Optional[PreTrainedModel] = None,
+    ):
+        r"""
+        encoder (`PreTrainedModel`, *optional*):
+            The encoder model to use.
+        decoder (`PreTrainedModel`, *optional*):
+            The decoder model to use.
+        """
+        if config is None and (encoder is None or decoder is None):
+            raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
+        if config is None:
+            config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
+        else:
+            if not isinstance(config, self.config_class):
+                raise ValueError(f"Config: {config} has to be of type {self.config_class}")
+
+        if config.decoder.cross_attention_hidden_size is not None:
+            if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
+                raise ValueError(
+                    "If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
+                    f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
+                    f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
+                    " `config.encoder.hidden_size`."
+                )
+
+        # initialize with config
+        # make sure input & output embeddings are not tied
+        config.tie_word_embeddings = False
+        super().__init__(config)
+
+        if encoder is None:
+            encoder = AutoModel.from_config(config.encoder)
+
+        if decoder is None:
+            decoder = AutoModelForCausalLM.from_config(config.decoder)
+
+        self.encoder = encoder
+        self.decoder = decoder
+        self._can_compile_fullgraph = decoder._can_compile_fullgraph
+
+        if self.encoder.config.to_dict() != self.config.encoder.to_dict():
+            logger.warning(
+                f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
+                f" {self.config.encoder}"
+            )
+        if self.decoder.config.to_dict() != self.config.decoder.to_dict():
+            logger.warning(
+                f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
+                f" {self.config.decoder}"
+            )
+
+        # make sure that the individual model's config refers to the shared config
+        # so that the updates to the config will be synced
+        self.config.encoder._attn_implementation = self.encoder.config._attn_implementation
+        self.config.decoder._attn_implementation = self.decoder.config._attn_implementation
+        self.encoder.config = self.config.encoder
+        self.decoder.config = self.config.decoder
+
+        # encoder outputs might need to be projected to different dimension for decoder
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            self.enc_to_dec_proj = nn.Linear(self.encoder.config.hidden_size, self.decoder.config.hidden_size)
+
+        if self.encoder.get_output_embeddings() is not None:
+            raise ValueError(
+                f"The encoder {self.encoder} should not have a LM Head. Please use a model without LM Head"
+            )
+
+    def get_encoder(self):
+        return self.encoder
+
+    def get_input_embeddings(self):
+        return self.decoder.get_input_embeddings()
+
+    def get_output_embeddings(self):
+        return self.decoder.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        return self.decoder.set_output_embeddings(new_embeddings)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("ydshieh/vit-gpt2-coco-en")
+        >>> decoder_tokenizer = AutoTokenizer.from_pretrained("ydshieh/vit-gpt2-coco-en")
+        >>> model = VisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> img = Image.open(requests.get(url, stream=True).raw)
+        >>> pixel_values = image_processor(images=img, return_tensors="pt").pixel_values  # Batch size 1
+
+        >>> output_ids = model.generate(
+        ...     pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True
+        ... ).sequences
+
+        >>> preds = decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+        >>> preds = [pred.strip() for pred in preds]
+
+        >>> assert preds == ["a cat laying on top of a couch next to another cat"]
+        ```"""
+
+        from_tf = kwargs.pop("from_tf", False)
+        if from_tf:
+            from transformers import TFVisionEncoderDecoderModel
+
+            # a workaround to load from tensorflow checkpoint
+            # Using `_tf_model` won't work, because the weight names in the encoder/decoder of `_tf_model` get
+            # extended before saving those components. For example, the name of `_tf_model.encoder.vit` is
+            # `[top model name]/encoder/vit`, but the name of `tf_model.encoder.vit` is `[top model name]/vit`. The
+            # [top model name] is handled (stripped) by the conversion method, and the former case gets extra `encoder`,
+            # which should not occur when we want to save the components alone.
+            # There was a (very) ugly potential fix, which wasn't integrated to `transformers`: see
+            #   https://github.com/huggingface/transformers/pull/13222/commits/dbb3c9de76eee235791d2064094654637c99f36d#r697304245
+            #   (the change in `src/transformers/modeling_tf_utils.py`)
+            _tf_model = TFVisionEncoderDecoderModel.from_pretrained(
+                pretrained_model_name_or_path, *model_args, **kwargs
+            )
+            config = _tf_model.config
+
+            # Using `tf_model` instead
+            encoder = _tf_model.encoder.__class__(_tf_model.config.encoder)
+            decoder = _tf_model.decoder.__class__(_tf_model.config.decoder)
+            # Make sure models are built
+            encoder(encoder.dummy_inputs)
+            decoder(decoder.dummy_inputs)
+
+            # Get the variable correspondence between `_tf_model` and `encoder` and `decoder`
+            encoder_variables = {}
+            for v in encoder.trainable_variables + encoder.non_trainable_variables:
+                encoder_variables["/".join(v.name.split("/")[1:])] = v
+            decoder_variables = {}
+            for v in decoder.trainable_variables + decoder.non_trainable_variables:
+                decoder_variables["/".join(v.name.split("/")[1:])] = v
+
+            _encoder_variables = {}
+            for v in _tf_model.encoder.trainable_variables + _tf_model.encoder.non_trainable_variables:
+                _encoder_variables["/".join(v.name.split("/")[2:])] = v
+            _decoder_variables = {}
+            for v in _tf_model.decoder.trainable_variables + _tf_model.decoder.non_trainable_variables:
+                _decoder_variables["/".join(v.name.split("/")[2:])] = v
+
+            # assign weight values to `encoder` and `decoder` from `_tf_model`
+            for name, v in encoder_variables.items():
+                v.assign(_encoder_variables[name])
+            for name, v in decoder_variables.items():
+                v.assign(_decoder_variables[name])
+
+            tf_model = TFVisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
+
+            # Deal with `enc_to_dec_proj`
+            if hasattr(_tf_model, "enc_to_dec_proj"):
+                tf_model(tf_model.dummy_inputs)
+                tf_model.enc_to_dec_proj.kernel.assign(_tf_model.enc_to_dec_proj.kernel)
+                tf_model.enc_to_dec_proj.bias.assign(_tf_model.enc_to_dec_proj.bias)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                encoder_dir = os.path.join(tmpdirname, "encoder")
+                decoder_dir = os.path.join(tmpdirname, "decoder")
+                tf_model.encoder.save_pretrained(encoder_dir)
+                tf_model.decoder.save_pretrained(decoder_dir)
+
+                if hasattr(tf_model, "enc_to_dec_proj"):
+                    enc_to_dec_proj_weight = torch.transpose(
+                        torch.from_numpy(tf_model.enc_to_dec_proj.kernel.numpy()), 1, 0
+                    )
+                    enc_to_dec_proj_bias = torch.from_numpy(tf_model.enc_to_dec_proj.bias.numpy())
+
+                del _tf_model
+                del tf_model
+                gc.collect()
+
+                attn_implementation = kwargs.get("attn_implementation")
+                kwargs_encoder_decoder = {}
+                if attn_implementation:
+                    kwargs_encoder_decoder = {
+                        "encoder_attn_implementation": attn_implementation,
+                        "decoder_attn_implementation": attn_implementation,
+                    }
+
+                model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+                    encoder_dir,
+                    decoder_dir,
+                    encoder_from_tf=True,
+                    decoder_from_tf=True,
+                    **kwargs_encoder_decoder,
+                )
+                # This is only for copying some specific attributes of this particular model.
+                model.config = config
+
+                if hasattr(model, "enc_to_dec_proj"):
+                    model.enc_to_dec_proj.weight.data = enc_to_dec_proj_weight.contiguous()
+                    model.enc_to_dec_proj.bias.data = enc_to_dec_proj_bias.contiguous()
+
+                return model
+
+        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+    @classmethod
+    def from_encoder_decoder_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path: Optional[str] = None,
+        decoder_pretrained_model_name_or_path: Optional[str] = None,
+        *model_args,
+        **kwargs,
+    ) -> PreTrainedModel:
+        r"""
+        Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model
+        checkpoints.
+
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+        the model, you need to first set it back in training mode with `model.train()`.
+
+        Params:
+            encoder_pretrained_model_name_or_path (`str`, *optional*):
+                Information necessary to instantiate the image encoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. An
+                      example is `google/vit-base-patch16-224-in21k`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
+                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
+                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
+                Information necessary to instantiate the text decoder. Can be either:
+
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+                      this case, `from_tf` should be set to `True` and a configuration object should be provided as
+                      `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
+                      PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args (remaining positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g.,
+                `output_attentions=True`).
+
+                - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
+                - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
+                - To update the parent model configuration, do not use a prefix for each configuration parameter.
+
+                Behaves differently depending on whether a `config` is provided or automatically loaded.
+
+        Example:
+
+        ```python
+        >>> from transformers import VisionEncoderDecoderModel
+
+        >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
+        >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
+        ... )
+        >>> # saving model after fine-tuning
+        >>> model.save_pretrained("./vit-bert")
+        >>> # load fine-tuned model
+        >>> model = VisionEncoderDecoderModel.from_pretrained("./vit-bert")
+        ```"""
+
+        kwargs_encoder = {
+            argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
+        }
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        # remove encoder, decoder kwargs from kwargs
+        for key in kwargs_encoder:
+            del kwargs["encoder_" + key]
+        for key in kwargs_decoder:
+            del kwargs["decoder_" + key]
+
+        # Load and initialize the encoder and decoder
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs_encoder.pop("model", None)
+        if encoder is None:
+            if encoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_encoder:
+                encoder_config, kwargs_encoder = AutoConfig.from_pretrained(
+                    encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
+                )
+
+                if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
+                    logger.info(
+                        f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
+                        "from a decoder model. Cross-attention and causal mask are disabled."
+                    )
+                    encoder_config.is_decoder = False
+                    encoder_config.add_cross_attention = False
+
+                kwargs_encoder["config"] = encoder_config
+
+            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
+
+        decoder = kwargs_decoder.pop("model", None)
+        if decoder is None:
+            if decoder_pretrained_model_name_or_path is None:
+                raise ValueError(
+                    "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
+                    "to be defined."
+                )
+
+            if "config" not in kwargs_decoder:
+                decoder_config, kwargs_decoder = AutoConfig.from_pretrained(
+                    decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
+                )
+
+                if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
+                    logger.info(
+                        f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
+                        f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
+                        f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
+                    )
+                    decoder_config.is_decoder = True
+                    decoder_config.add_cross_attention = True
+
+                kwargs_decoder["config"] = decoder_config
+
+            if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
+                logger.warning(
+                    f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
+                    f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
+                    "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
+                    "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
+                    "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
+                )
+
+            decoder = AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+
+        # instantiate config with corresponding kwargs
+        config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
+
+        # make sure input & output embeddings are not tied
+        config.tie_word_embeddings = False
+        return cls(encoder=encoder, decoder=decoder, config=config)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        encoder_outputs: Optional[tuple[torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            For training, `decoder_input_ids` are automatically created by the model by shifting the `labels` to the
+            right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`.
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+            representation. This is useful if you want more control over how to convert `decoder_input_ids` indices
+            into associated vectors than the model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss for the decoder. Indices should be in `[-100, 0,
+            ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, VisionEncoderDecoderModel
+        >>> import requests
+        >>> from PIL import Image
+        >>> import torch
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/trocr-base-handwritten")
+        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
+
+        >>> # load image from the IAM dataset
+        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+
+        >>> # training
+        >>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id
+        >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
+        >>> model.config.vocab_size = model.config.decoder.vocab_size
+
+        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
+        >>> text = "hello world"
+        >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
+        >>> outputs = model(pixel_values=pixel_values, labels=labels)
+        >>> loss = outputs.loss
+
+        >>> # inference (generation)
+        >>> generated_ids = model.generate(pixel_values)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # num_items_in_batch is only needed for loss computation
+        num_items_in_batch = kwargs.pop("num_items_in_batch", None)
+
+        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
+
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
+        }
+
+        if encoder_outputs is None:
+            if pixel_values is None:
+                raise ValueError("You have to specify pixel_values")
+
+            encoder_outputs = self.encoder(
+                pixel_values=pixel_values,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                **kwargs_encoder,
+            )
+        elif isinstance(encoder_outputs, tuple):
+            encoder_outputs = BaseModelOutput(*encoder_outputs)
+
+        encoder_hidden_states = encoder_outputs[0]
+
+        # optionally project encoder_hidden_states
+        if (
+            self.encoder.config.hidden_size != self.decoder.config.hidden_size
+            and self.decoder.config.cross_attention_hidden_size is None
+        ):
+            encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
+
+        # no encoder attention mask is built here; `None` is forwarded to the decoder
+        encoder_attention_mask = None
+
+        if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
+            decoder_input_ids = shift_tokens_right(
+                labels, self.config.pad_token_id, self.config.decoder_start_token_id
+            )
+
+        # Decode
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            past_key_values=past_key_values,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **kwargs_decoder,
+        )
+
+        # Compute loss independent from decoder (as some shift the logits inside them)
+        loss = None
+        if labels is not None:
+            logits = decoder_outputs.logits if return_dict else decoder_outputs[0]
+
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.decoder.config.vocab_size,
+                num_items_in_batch=num_items_in_batch,
+            )
+
+        if not return_dict:
+            if loss is not None:
+                return (loss,) + decoder_outputs + encoder_outputs
+            else:
+                return decoder_outputs + encoder_outputs
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=decoder_outputs.logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+
+__all__ = ["VisionEncoderDecoderModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5577c10f76f32c3339fe87ced37e1d4b84f8a260
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/configuration_visual_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/configuration_visual_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec826413bb66646f88ea3f41c29e25abf0ef7a51
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/configuration_visual_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/modeling_visual_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/modeling_visual_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc21fcb8d77a3c4282062ccce6e9592e81f02e04
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/visual_bert/__pycache__/modeling_visual_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7520263c51bcd3b63747ee0a55e1baff24a777f9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_wav2vec2_bert import *
+    from .modeling_wav2vec2_bert import *
+    from .processing_wav2vec2_bert import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/configuration_wav2vec2_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/configuration_wav2vec2_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d32005e0fc61dbe0e8b0fa3c229ebb86282902e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/configuration_wav2vec2_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/modeling_wav2vec2_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/modeling_wav2vec2_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..363305004a96998e5916a2748ba3a5d5be646463
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/__pycache__/modeling_wav2vec2_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..2462976cfbbeb908fa3e8bbd04722f6aa066a7f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py
@@ -0,0 +1,313 @@
+# coding=utf-8
+# Copyright 2024 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wav2Vec2Bert model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Wav2Vec2BertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Wav2Vec2BertModel`]. It is used to
+    instantiate a Wav2Vec2Bert model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Bert
+    [facebook/wav2vec2-bert-rel-pos-large](https://huggingface.co/facebook/wav2vec2-bert-rel-pos-large)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*):
+            Vocabulary size of the Wav2Vec2Bert model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`Wav2Vec2BertModel`]. Vocabulary size of the
+            model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward
+            method of [`Wav2Vec2BertModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        feature_projection_input_dim (`int`, *optional*, defaults to 160):
+            Input dimension of this model, i.e. the dimension after processing input audios with [`SeamlessM4TFeatureExtractor`] or [`Wav2Vec2BertProcessor`].
+        hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the feature projection.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`Wav2Vec2BertForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+            Recognition](https://huggingface.co/papers/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
+            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+            procedure generates `mask_time_prob*len(time_axis)/mask_time_length` independent masks over the axis. If
+            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+            actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the time axis.
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
+            irrespectively of `mask_time_prob`. Only relevant if `mask_time_prob*len(time_axis)/mask_time_length <
+            mask_time_min_masks`.
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
+            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+            masking procedure generates `mask_feature_prob*len(feature_axis)/mask_feature_length` independent masks over
+            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+            may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+            True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the feature axis.
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
+            `mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks`.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`Wav2Vec2BertForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`Wav2Vec2BertForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+            instance of [`Wav2Vec2BertForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the projection before token mean-pooling for classification.
+        tdnn_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`tuple[int]` or `list[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
+        pad_token_id (`int`, *optional*, defaults to 0): The id of the _padding_ token.
+        bos_token_id (`int`, *optional*, defaults to 1): The id of the _beginning-of-stream_ token.
+        eos_token_id (`int`, *optional*, defaults to 2): The id of the _end-of-stream_ token.
+        add_adapter (`bool`, *optional*, defaults to `False`):
+            Whether a convolutional attention network should be stacked on top of the Wav2Vec2Bert Encoder. Can be very
+            useful for warm-starting Wav2Vec2Bert for SpeechEncoderDecoder models.
+        adapter_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adapter_stride (`int`, *optional*, defaults to 2):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        num_adapter_layers (`int`, *optional*, defaults to 1):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        adapter_act (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the adapter layers. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        use_intermediate_ffn_before_adapter (`bool`, *optional*, defaults to `False`):
+            Whether an intermediate feed-forward block should be stacked on top of the Wav2Vec2Bert Encoder and before the adapter network.
+            Only relevant if `add_adapter is True`.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer. If not defined, this defaults to `hidden_size`. Only relevant
+            if `add_adapter is True`.
+        position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`):
+            Can be set to:
+                - `rotary`, for rotary position embeddings.
+                - `relative`, for relative position embeddings.
+                - `relative_key`, for relative position embeddings as defined by Shaw in [Self-Attention
+                  with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+            If left to `None`, no relative position embeddings are applied.
+        rotary_embedding_base (`int`, *optional*, defaults to 10000):
+            If `"rotary"` position embeddings are used, defines the size of the embedding base.
+        max_source_positions (`int`, *optional*, defaults to 5000):
+            If `"relative"` position embeddings are used, defines the maximum source input positions.
+        left_max_position_embeddings (`int`, *optional*, defaults to 64):
+            If `"relative_key"` (aka Shaw) position embeddings are used, defines the left clipping value for relative positions.
+        right_max_position_embeddings (`int`, *optional*, defaults to 8):
+            If `"relative_key"` (aka Shaw) position embeddings are used, defines the right clipping value for relative positions.
+        conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
+            Kernel size of convolutional depthwise 1D layer in Conformer blocks.
+        conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all convolutional layers in Conformer blocks.
+    Example:
+
+    ```python
+    >>> from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel
+
+    >>> # Initializing a Wav2Vec2Bert facebook/wav2vec2-bert-rel-pos-large style configuration
+    >>> configuration = Wav2Vec2BertConfig()
+
+    >>> # Initializing a model (with random weights) from the facebook/wav2vec2-bert-rel-pos-large style configuration
+    >>> model = Wav2Vec2BertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "wav2vec2-bert"
+
+    def __init__(
+        self,
+        vocab_size=None,
+        hidden_size=1024,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        intermediate_size=4096,
+        feature_projection_input_dim=160,
+        hidden_act="swish",
+        hidden_dropout=0.0,
+        activation_dropout=0.0,
+        attention_dropout=0.0,
+        feat_proj_dropout=0.0,
+        final_dropout=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        apply_spec_augment=True,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        mask_feature_min_masks=0,
+        ctc_loss_reduction="sum",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=768,
+        tdnn_dim=(512, 512, 512, 512, 1500),
+        tdnn_kernel=(5, 3, 3, 1, 1),
+        tdnn_dilation=(1, 2, 3, 1, 1),
+        xvector_output_dim=512,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        add_adapter=False,
+        adapter_kernel_size=3,
+        adapter_stride=2,
+        num_adapter_layers=1,
+        adapter_act="relu",
+        use_intermediate_ffn_before_adapter=False,
+        output_hidden_size=None,
+        position_embeddings_type="relative_key",
+        rotary_embedding_base=10000,
+        max_source_positions=5000,
+        left_max_position_embeddings=64,
+        right_max_position_embeddings=8,
+        conv_depthwise_kernel_size=31,
+        conformer_conv_dropout=0.1,
+        **kwargs,
+    ):
+        # The special token ids are forwarded to the parent constructor together with any extra kwargs.
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        # Core encoder hyper-parameters, stored unchanged.
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_attention_heads = num_attention_heads
+        self.feature_projection_input_dim = feature_projection_input_dim
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.feat_proj_dropout = feat_proj_dropout
+        self.final_dropout = final_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.vocab_size = vocab_size
+        self.use_weighted_layer_sum = use_weighted_layer_sum
+        self.max_source_positions = max_source_positions
+
+        # Reject unknown position embedding types up front; `None` is explicitly allowed.
+        if position_embeddings_type is not None and position_embeddings_type not in [
+            "rotary",
+            "relative",
+            "relative_key",
+        ]:
+            raise ValueError(
+                """
+                `position_embeddings_type` is not valid. It must be one of the following values:
+                `["rotary", "relative", "relative_key"]` or left as `None`.
+                """
+            )
+        self.position_embeddings_type = position_embeddings_type
+        self.rotary_embedding_base = rotary_embedding_base
+        self.left_max_position_embeddings = left_max_position_embeddings
+        self.right_max_position_embeddings = right_max_position_embeddings
+
+        # Conformer-block related
+        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
+        self.conformer_conv_dropout = conformer_conv_dropout
+
+        # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779
+        self.apply_spec_augment = apply_spec_augment
+        self.mask_time_prob = mask_time_prob
+        self.mask_time_length = mask_time_length
+        self.mask_time_min_masks = mask_time_min_masks
+        self.mask_feature_prob = mask_feature_prob
+        self.mask_feature_length = mask_feature_length
+        self.mask_feature_min_masks = mask_feature_min_masks
+
+        # ctc loss
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+
+        # adapter
+        self.add_adapter = add_adapter
+        self.adapter_kernel_size = adapter_kernel_size
+        self.adapter_stride = adapter_stride
+        self.num_adapter_layers = num_adapter_layers
+        self.adapter_act = adapter_act
+        # Falls back to `hidden_size` when no explicit output size is given.
+        self.output_hidden_size = output_hidden_size if output_hidden_size is not None else hidden_size
+        # The intermediate FFN option is only accepted together with `add_adapter`.
+        if use_intermediate_ffn_before_adapter and not add_adapter:
+            raise ValueError("`use_intermediate_ffn_before_adapter` is `True` but `add_adapter` is `False`.")
+        self.use_intermediate_ffn_before_adapter = use_intermediate_ffn_before_adapter
+
+        # SequenceClassification-specific parameter. Feel free to ignore for other classes.
+        self.classifier_proj_size = classifier_proj_size
+
+        # XVector-specific parameters. Feel free to ignore for other classes.
+        # The three TDNN sequences are stored as fresh lists, whatever sequence type was passed in.
+        self.tdnn_dim = list(tdnn_dim)
+        self.tdnn_kernel = list(tdnn_kernel)
+        self.tdnn_dilation = list(tdnn_dilation)
+        self.xvector_output_dim = xvector_output_dim
+
+    @property
+    def inputs_to_logits_ratio(self):
+        """Twice `feature_projection_input_dim`, further multiplied by
+        `adapter_stride ** num_adapter_layers` when `add_adapter` is set."""
+        adapter_factor = (self.adapter_stride**self.num_adapter_layers) if self.add_adapter else 1
+        return self.feature_projection_input_dim * 2 * adapter_factor
+
+
+__all__ = ["Wav2Vec2BertConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..3448089c632bbc5515f0b1c75da424ed1cc23173
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py
@@ -0,0 +1,1515 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_wav2vec2_bert.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import math
+import warnings
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, is_peft_available
+from .configuration_wav2vec2_bert import Wav2Vec2BertConfig
+
+
+class Wav2Vec2BertRotaryPositionalEmbedding(nn.Module):
+    """Rotary positional embedding
+    Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864
+
+    The cos/sin table is cached between calls. It is rebuilt whenever the sequence length, the dtype or
+    the device of the incoming hidden states differs from the cached table.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        # per-head width; one frequency is generated for every second channel of a head
+        dim = config.hidden_size // config.num_attention_heads
+        base = config.rotary_embedding_base
+
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+        # Ignore copy
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.cached_sequence_length = None
+        self.cached_rotary_positional_embedding = None
+
+    def forward(self, hidden_states):
+        """Return the stacked `[cos, sin]` rotary table matching `hidden_states`.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Only `shape[1]` (used as the sequence length), the dtype and the device are read.
+
+        Returns:
+            `torch.Tensor` of shape `(2, sequence_length, 1, 1, 2 * inv_freq.numel())`, where index 0 holds
+            the cosines and index 1 the sines, in the dtype and on the device of `hidden_states`.
+        """
+        sequence_length = hidden_states.shape[1]
+
+        cached = self.cached_rotary_positional_embedding
+        # The cache is only valid for the same length AND the same dtype/device; otherwise a table built
+        # before e.g. `.half()` or `.to(device)` would be handed to inputs it no longer matches.
+        if (
+            cached is not None
+            and sequence_length == self.cached_sequence_length
+            and cached.dtype == hidden_states.dtype
+            and cached.device == hidden_states.device
+        ):
+            return cached
+
+        self.cached_sequence_length = sequence_length
+        # Embeddings are computed in the dtype of the inv_freq constant
+        time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
+        freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
+        # the frequencies are duplicated so the last dimension is twice the number of frequencies
+        embeddings = torch.cat((freqs, freqs), dim=-1)
+
+        cos_embeddings = embeddings.cos()[:, None, None, :]
+        sin_embeddings = embeddings.sin()[:, None, None, :]
+        # Computed embeddings are cast to the dtype of the hidden state inputs
+        self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states)
+        return self.cached_rotary_positional_embedding
+
+
+class Wav2Vec2BertRelPositionalEmbedding(nn.Module):
+    """Relative positional encoding module.
+
+    Keeps a sinusoidal table `self.pe` of shape `(1, 2 * L - 1, d_model)` covering relative offsets in both
+    directions, where `L` is the longest length the table has been built for. `self.pe` is a plain
+    attribute (it is not registered as a buffer in this class).
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.max_len = config.max_source_positions
+        self.d_model = config.hidden_size
+        self.pe = None
+        # Pre-build the table for `max_len` positions; the dummy tensor only supplies length, dtype and device.
+        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
+
+    def extend_pe(self, x):
+        """Make sure `self.pe` covers `2 * x.size(1) - 1` offsets and matches the dtype/device of `x`.
+
+        When the existing table is already long enough it is only moved/cast if needed; otherwise it is
+        rebuilt from scratch for length `x.size(1)`.
+        """
+        # Reset the positional encodings
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` is the position of query vector and `j` is the
+        # position of key vector. We use positive relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x.size(1), self.d_model)
+        pe_negative = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
+        )
+        # even channels hold sines, odd channels hold cosines
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+        # Reverse the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://huggingface.co/papers/1901.02860
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        # offset 0 is already present in the positive half, so it is dropped from the negative half
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, hidden_states: torch.Tensor):
+        """Return the `2 * T - 1` table entries centred on the middle of `self.pe`, with `T = hidden_states.size(1)`."""
+        self.extend_pe(hidden_states)
+        # the centre index `self.pe.size(1) // 2` corresponds to relative offset 0
+        start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
+        end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
+        relative_position_embeddings = self.pe[:, start_idx:end_idx]
+
+        return relative_position_embeddings
+
+
+class Wav2Vec2BertFeatureProjection(nn.Module):
+    """Layer-normalises the input features, projects them to `config.hidden_size` and applies dropout."""
+
+    def __init__(self, config):
+        super().__init__()
+        input_dim = config.feature_projection_input_dim
+        self.layer_norm = nn.LayerNorm(input_dim, eps=config.layer_norm_eps)
+        self.projection = nn.Linear(input_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # the normalised, not yet projected, features are returned as well (needed for quantization)
+        norm_hidden_states = self.layer_norm(hidden_states)
+        projected_states = self.dropout(self.projection(norm_hidden_states))
+        return projected_states, norm_hidden_states
+
+
+class Wav2Vec2BertFeedForward(nn.Module):
+    """Two-layer feed-forward block: dense -> activation -> dropout -> dense -> dropout.
+
+    `act_fn` may be an activation name (looked up in `ACT2FN`) or a callable; both `act_fn` and
+    `hidden_size` default to the values stored on `config`.
+    """
+
+    def __init__(self, config, act_fn=None, hidden_size=None):
+        super().__init__()
+        if act_fn is None:
+            act_fn = config.hidden_act
+        if hidden_size is None:
+            hidden_size = config.hidden_size
+
+        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+        self.intermediate_dense = nn.Linear(hidden_size, config.intermediate_size)
+        if isinstance(act_fn, str):
+            self.intermediate_act_fn = ACT2FN[act_fn]
+        else:
+            self.intermediate_act_fn = act_fn
+
+        self.output_dense = nn.Linear(config.intermediate_size, hidden_size)
+        self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+    def forward(self, hidden_states):
+        # expand to the intermediate width
+        expanded = self.intermediate_act_fn(self.intermediate_dense(hidden_states))
+        expanded = self.intermediate_dropout(expanded)
+        # project back to the input width
+        return self.output_dropout(self.output_dense(expanded))
+
+
+class Wav2Vec2BertConvolutionModule(nn.Module):
+    """Convolution block used in the conformer block
+
+    Pipeline of `forward`: layer norm -> optional zeroing of masked positions -> pointwise conv + GLU ->
+    left-padded depthwise conv -> layer norm -> activation -> pointwise conv -> dropout.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        # an even kernel size is rejected
+        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        # doubles the channel count so that the GLU below can halve it again
+        self.pointwise_conv1 = nn.Conv1d(
+            config.hidden_size,
+            2 * config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.glu = nn.GLU(dim=1)
+        # groups == channels, i.e. every channel is convolved on its own; padding is added manually in `forward`
+        self.depthwise_conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding=0,
+            groups=config.hidden_size,
+            bias=False,
+        )
+
+        self.depthwise_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.activation = ACT2FN[config.hidden_act]
+        self.pointwise_conv2 = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.dropout = nn.Dropout(config.conformer_conv_dropout)
+
+    def forward(self, hidden_states, attention_mask=None):
+        # `hidden_states` is normalised over its last dimension (size `config.hidden_size`) and comes back
+        # in the same layout it was given in.
+        # NOTE(review): `attention_mask` is presumably a (batch, time) mask with non-zero = keep - confirm with caller.
+        hidden_states = self.layer_norm(hidden_states)
+
+        # Ensure that we do not leak padded positions in depthwise convolution if attention mask is passed.
+        # Put 0 where necessary
+        if attention_mask is not None:
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism
+        # => (batch, 2*channel, time)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # => (batch, channel, time)
+        hidden_states = self.glu(hidden_states)
+
+        # Pad the sequence entirely on the left because of causal convolution.
+        # (kernel_size - 1 zeros on the left, none on the right, so the length is unchanged after the conv)
+        hidden_states = torch.nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0))
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+
+        # LayerNorm acts on the last dimension, hence the transpose to channel-last and back
+        hidden_states = self.depthwise_layer_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        # back to the layout the input came in
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class Wav2Vec2BertSelfAttention(nn.Module):
+    """Construct an Wav2Vec2BertSelfAttention object.
+    Can be enhanced with rotary or relative position embeddings.
+
+    When `is_adapter_attention` is set, the layer is sized with `config.output_hidden_size` and uses no
+    position embeddings at all.
+    """
+
+    def __init__(self, config, is_adapter_attention=False):
+        super().__init__()
+        hidden_size = config.hidden_size if not is_adapter_attention else config.output_hidden_size
+
+        self.head_size = hidden_size // config.num_attention_heads
+        self.num_heads = config.num_attention_heads
+        self.position_embeddings_type = config.position_embeddings_type if not is_adapter_attention else None
+
+        self.linear_q = nn.Linear(hidden_size, hidden_size)
+        self.linear_k = nn.Linear(hidden_size, hidden_size)
+        self.linear_v = nn.Linear(hidden_size, hidden_size)
+        self.linear_out = nn.Linear(hidden_size, hidden_size)
+
+        self.dropout = nn.Dropout(p=config.attention_dropout)
+
+        if self.position_embeddings_type == "relative":
+            # linear transformation for positional encoding
+            self.linear_pos = nn.Linear(hidden_size, hidden_size, bias=False)
+            # these two learnable biases are used in matrix c and matrix d
+            # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+
+        if self.position_embeddings_type == "relative_key":
+            self.left_max_position_embeddings = config.left_max_position_embeddings
+            self.right_max_position_embeddings = config.right_max_position_embeddings
+            # one embedding per clipped distance in [-left, right], including 0
+            num_positions = self.left_max_position_embeddings + self.right_max_position_embeddings + 1
+            self.distance_embedding = nn.Embedding(num_positions, self.head_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # Returns `(attention_output, attention_probabilities)`.
+        # NOTE(review): `output_attentions` is not consulted in this method - the probabilities are always
+        # returned - and the return annotation lists three entries while two are returned.
+        # self-attention mechanism
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        if self.position_embeddings_type == "rotary":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
+                )
+            # rotary embeddings are applied to the query/key input only; values stay untouched
+            query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        if self.position_embeddings_type == "relative":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type =="
+                    " 'relative'"
+                )
+            # apply relative_position_embeddings to qk scores
+            # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860
+            scores = self._apply_relative_embeddings(
+                query=query, key=key, relative_position_embeddings=relative_position_embeddings
+            )
+        else:
+            # plain scaled dot-product scores
+            scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        if self.position_embeddings_type == "relative_key":
+            query_length, key_length = query.shape[2], key.shape[2]
+
+            position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            # distance[i, j] = j - i (key position minus query position), clipped to [-left, right]
+            distance = position_ids_r - position_ids_l
+            distance = torch.clamp(distance, -self.left_max_position_embeddings, self.right_max_position_embeddings)
+
+            # shift by `left` so the clipped distances become valid, non-negative embedding indices
+            positional_embedding = self.distance_embedding(distance + self.left_max_position_embeddings)
+            positional_embedding = positional_embedding.to(dtype=query.dtype)  # fp16 compatibility
+
+            relative_position_attn_weights = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+            scores = scores + (relative_position_attn_weights / math.sqrt(self.head_size))
+
+        # apply attention_mask if necessary
+        # (the mask is added to the raw scores, so it has to be additive and broadcastable to them)
+        if attention_mask is not None:
+            scores = scores + attention_mask
+
+        # => (batch, head, time1, time2)
+        probs = torch.softmax(scores, dim=-1)
+        probs = self.dropout(probs)
+
+        # => (batch, head, time1, d_k)
+        hidden_states = torch.matmul(probs, value)
+
+        # => (batch, time1, hidden_size)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        hidden_states = self.linear_out(hidden_states)
+
+        return hidden_states, probs
+
+    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
+        # Rotates `hidden_states` head-wise with the cos/sin table found at index 0 / 1 of
+        # `relative_position_embeddings`; the result has the same (batch, time, hidden) layout as the input.
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
+
+        cos = relative_position_embeddings[0, :sequence_length, ...]
+        sin = relative_position_embeddings[1, :sequence_length, ...]
+
+        # rotate hidden_states with rotary embeddings
+        # (time is moved to the front so that cos/sin, which are indexed by time first, broadcast over it)
+        hidden_states = hidden_states.transpose(0, 1)
+        rotated_states_begin = hidden_states[..., : self.head_size // 2]
+        rotated_states_end = hidden_states[..., self.head_size // 2 :]
+        rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
+        hidden_states = (hidden_states * cos) + (rotated_states * sin)
+        hidden_states = hidden_states.transpose(0, 1)
+
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
+
+        return hidden_states
+
+    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
+        # Computes the attention scores with relative position terms; the result is already divided by
+        # sqrt(head_size).
+        # 1. project positional embeddings
+        # => (batch, head, 2*time1-1, d_k)
+        proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.view(
+            relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
+        )
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
+        # the last two dims are swapped so the projection can be used as the right operand of the matmul in step 4
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
+
+        # 2. Add bias to query
+        # => (batch, head, time1, d_k)
+        # (the query is put in (batch, time1, head, d_k) layout so the (head, d_k) biases broadcast, then moved back)
+        query = query.transpose(1, 2)
+        q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
+        q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
+
+        # 3. attention score: first compute matrix a and matrix c
+        # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+        # => (batch, head, time1, time2)
+        scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
+
+        # 4. then compute matrix b and matrix d
+        # => (batch, head, time1, 2*time1-1)
+        scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
+
+        # 5. shift matrix b and matrix d
+        # (pad one zero column, reinterpret the memory with swapped trailing dims, drop the first row and view back)
+        zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
+        scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
+        scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
+        scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
+        scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
+        # keep only the first half (plus the centre column) of the shifted scores
+        scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
+
+        # 6. sum matrices
+        # => (batch, head, time1, time2)
+        scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
+
+        return scores
+
+
+class Wav2Vec2BertEncoderLayer(GradientCheckpointingLayer):
+    """Conformer block (https://huggingface.co/papers/2005.08100): half-step feed-forward, self-attention,
+    convolution module, second half-step feed-forward and a final layer norm, each with a residual connection.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        norm_eps = config.layer_norm_eps
+
+        # Feed-forward 1
+        self.ffn1_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+        self.ffn1 = Wav2Vec2BertFeedForward(config)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+        self.self_attn_dropout = nn.Dropout(config.attention_dropout)
+        self.self_attn = Wav2Vec2BertSelfAttention(config)
+
+        # Conformer Convolution
+        self.conv_module = Wav2Vec2BertConvolutionModule(config)
+
+        # Feed-forward 2
+        self.ffn2_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+        self.ffn2 = Wav2Vec2BertFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        conv_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # 1. first feed-forward, added to the input with a 0.5 weight
+        ffn1_output = self.ffn1(self.ffn1_layer_norm(hidden_states))
+        hidden_states = ffn1_output * 0.5 + hidden_states
+
+        # 2. self-attention on the normalised states, followed by dropout and the residual
+        attn_output, attn_weights = self.self_attn(
+            hidden_states=self.self_attn_layer_norm(hidden_states),
+            attention_mask=attention_mask,
+            relative_position_embeddings=relative_position_embeddings,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(attn_output) + hidden_states
+
+        # 3. convolution module (it receives its own mask) plus residual
+        conv_output = self.conv_module(hidden_states, attention_mask=conv_attention_mask)
+        hidden_states = hidden_states + conv_output
+
+        # 4. second feed-forward with a 0.5 weight, then the closing layer norm
+        ffn2_output = self.ffn2(self.ffn2_layer_norm(hidden_states))
+        hidden_states = self.final_layer_norm(ffn2_output * 0.5 + hidden_states)
+
+        return hidden_states, attn_weights
+
+
+class Wav2Vec2BertEncoder(nn.Module):
+    # Stack of `config.num_hidden_layers` encoder layers with LayerDrop. A shared position-embedding module
+    # is created for the "relative" and "rotary" types only; for any other setting `embed_positions` is None.
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        if config.position_embeddings_type == "relative":
+            self.embed_positions = Wav2Vec2BertRelPositionalEmbedding(config)
+        elif config.position_embeddings_type == "rotary":
+            self.embed_positions = Wav2Vec2BertRotaryPositionalEmbedding(config)
+        else:
+            self.embed_positions = None
+
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([Wav2Vec2BertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        # Returns a `BaseModelOutput`, or - when `return_dict` is False - a tuple of the non-None values among
+        # (last hidden state, all hidden states, all attentions).
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        # the un-expanded mask is kept for the convolution modules
+        conv_attention_mask = attention_mask
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+
+            # extend attention_mask
+            # (additive form: 0 where the mask is 1, the dtype minimum where it is 0;
+            #  expanded to (batch, 1, seq, seq))
+            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+            attention_mask = attention_mask.expand(
+                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+            )
+
+        hidden_states = self.dropout(hidden_states)
+
+        # computed once and shared by all layers
+        if self.embed_positions is not None:
+            relative_position_embeddings = self.embed_positions(hidden_states)
+        else:
+            relative_position_embeddings = None
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            # (the random number is drawn on every iteration, also outside of training)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                # NOTE(review): `hidden_states` is reassigned inside this branch, so with `synced_gpus` the
+                # output of a layer marked as skipped is still used; only its attention entry becomes None.
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    relative_position_embeddings=relative_position_embeddings,
+                    output_attentions=output_attentions,
+                    conv_attention_mask=conv_attention_mask,
+                )
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        # the output of the last layer is appended as the final hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class Wav2Vec2BertAdapter(nn.Module):
+    """Optional projection to `config.output_hidden_size` followed by a stack of adapter layers.
+
+    During training each adapter layer is only applied when a random draw exceeds `config.layerdrop`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        needs_projection = config.output_hidden_size != config.hidden_size
+        if needs_projection:
+            # feature dim might need to be down-projected
+            self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
+            self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size, eps=config.layer_norm_eps)
+        else:
+            self.proj = None
+            self.proj_layer_norm = None
+        adapter_layers = (Wav2Vec2BertAdapterLayer(config) for _ in range(config.num_adapter_layers))
+        self.layers = nn.ModuleList(adapter_layers)
+        self.layerdrop = config.layerdrop
+
+        self.kernel_size = config.adapter_kernel_size
+        self.stride = config.adapter_stride
+
+    def _compute_sub_sample_lengths_from_attention_mask(self, seq_lens):
+        """Length after one strided convolution with `kernel_size // 2` padding; `None` is passed through."""
+        if seq_lens is not None:
+            padding = self.kernel_size // 2
+            strided = (seq_lens + 2 * padding - self.kernel_size) / self.stride
+            seq_lens = (strided + 1).floor()
+        return seq_lens
+
+    def forward(self, hidden_states, attention_mask=None):
+        # down project hidden_states only when both projection modules exist
+        if not (self.proj is None or self.proj_layer_norm is None):
+            hidden_states = self.proj_layer_norm(self.proj(hidden_states))
+
+        if attention_mask is None:
+            sub_sampled_lengths = None
+        else:
+            # per-sequence length = mask width minus the number of zero entries in the mask
+            masked_out = (1 - attention_mask.int()).sum(1)
+            sub_sampled_lengths = (attention_mask.size(1) - masked_out).to(hidden_states.device)
+
+        for adapter_layer in self.layers:
+            # the random draw and the length update happen for every layer, whether or not it is applied
+            layerdrop_prob = torch.rand([])
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(sub_sampled_lengths)
+            keep_layer = (not self.training) or (layerdrop_prob > self.layerdrop)
+            if keep_layer:
+                hidden_states = adapter_layer(
+                    hidden_states, attention_mask=attention_mask, sub_sampled_lengths=sub_sampled_lengths
+                )
+
+        return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2._compute_new_attention_mask
+def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
+    """
+    Build a `(batch, seq_len)` mask that is 1 for the first `seq_lens[i]` positions of row `i` and 0 afterwards.
+
+    Args:
+        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
+            The sequences to mask; only the first two dimensions, the dtype and the device are used.
+        seq_lens (`torch.Tensor` of shape `(batch)`:
+            Length of each sequence in `hidden_states`.
+
+    Returns:
+        `torch.FloatTensor`: mask of shape `(batch, seq_len)`, created with `hidden_states.new_ones`.
+    """
+    batch_size, mask_seq_len = hidden_states.shape[:2]
+
+    # position index of every frame, repeated for each batch row
+    positions = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
+    lengths = seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
+
+    # True wherever the position lies at or beyond the sequence length, i.e. on padding
+    is_padding = positions >= lengths
+
+    return hidden_states.new_ones((batch_size, mask_seq_len)).masked_fill(is_padding, 0)
+
+
+class Wav2Vec2BertAdapterLayer(nn.Module):
+    """
+    One adapter block: a strided `Conv1d` + GLU "residual" branch in parallel with a strided `Conv1d` + GLU
+    branch that feeds self-attention, followed by a feed-forward sub-layer with its own residual connection.
+
+    Both convolutions are created with the same kernel size, stride and padding, so both branches are
+    shortened in the same way and can be summed after the attention step.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.output_hidden_size
+        dropout = config.conformer_conv_dropout
+
+        self.kernel_size = config.adapter_kernel_size
+        self.stride = config.adapter_stride
+
+        # 1. residual convolution
+        # The conv doubles the channel count; the GLU below (dim=1) halves it again.
+        # NOTE(review): padding here is `stride // 2`, while the length helpers elsewhere in this file
+        # (`_compute_sub_sample_lengths_from_attention_mask`, `_get_feat_extract_output_lengths`) use
+        # `adapter_kernel_size // 2`. The two only agree for some kernel/stride pairs - confirm intent.
+        self.residual_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.residual_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        # shared by both conv branches in `forward`
+        self.activation = nn.GLU(dim=1)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.self_attn_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.self_attn = Wav2Vec2BertSelfAttention(config, is_adapter_attention=True)
+        self.self_attn_dropout = nn.Dropout(dropout)
+
+        # Feed-forward
+        self.ffn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.ffn = Wav2Vec2BertFeedForward(config, act_fn=config.adapter_act, hidden_size=embed_dim)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        sub_sampled_lengths: Optional[torch.Tensor] = None,
+    ):
+        """
+        Sub-sample `hidden_states` along the sequence axis and apply attention + feed-forward.
+
+        Args:
+            hidden_states: input of shape `(batch, seq_len, feature_dim)` (per the transpose comments below).
+            attention_mask: only checked for `None`; when it is not `None` a fresh mask is rebuilt from
+                `sub_sampled_lengths`, so the incoming mask values themselves are never read here.
+            output_attentions: forwarded to the self-attention module; the attention weights it returns
+                are discarded by this method.
+            sub_sampled_lengths: per-sample lengths after sub-sampling. Must be a tensor whenever
+                `attention_mask` is not `None`, because `_compute_new_attention_mask` reads its device.
+
+        Returns:
+            The transformed hidden states only.
+        """
+        residual = self.residual_layer_norm(hidden_states)
+
+        # Apply pooling to the residual to match the sequence length of the
+        # multi-head attention output.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        residual = residual.transpose(1, 2)
+        residual = self.residual_conv(residual)
+        residual = self.activation(residual)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        residual = residual.transpose(1, 2)
+
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # Apply pooling before feeding to the multihead-attention layer.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.self_attn_conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        if attention_mask is not None:
+            # Rebuild a (batch, new_seq_len) mask from the shortened lengths, then expand it with
+            # `_prepare_4d_attention_mask` (presumably to the 4D form the attention module expects - confirm).
+            attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
+            attention_mask = _prepare_4d_attention_mask(
+                attention_mask,
+                hidden_states.dtype,
+            )
+
+        # The rest of the computation is identical to a vanilla Transformer
+        # encoder layer.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        # first residual connection: attention output + pooled residual branch
+        hidden_states = hidden_states + residual
+
+        residual = hidden_states
+
+        # second residual connection around the (pre-normed) feed-forward
+        hidden_states = self.ffn_layer_norm(hidden_states)
+        hidden_states = self.ffn(hidden_states) + residual
+
+        return hidden_states
+
+
+@auto_docstring
+class Wav2Vec2BertPreTrainedModel(PreTrainedModel):
+    # Shared base class for the Wav2Vec2-BERT models in this file: declares the config type and
+    # checkpoint prefix, the weight-initialisation scheme, and helpers that convert input lengths /
+    # attention masks into their post-adapter equivalents.
+    config: Wav2Vec2BertConfig
+    base_model_prefix = "wav2vec2_bert"
+    main_input_name = "input_features"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """
+        Initialize the weights of a single `module`.
+
+        The branches are checked in order and only the first match runs, so the model-specific module
+        types are tested before the generic `nn.Linear` / `nn.LayerNorm` / `nn.Conv1d` cases.
+        """
+        if isinstance(module, Wav2Vec2BertSelfAttention):
+            # the positional bias parameters are optional attributes of the attention module
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, Wav2Vec2BertFeatureProjection):
+            # uniform in [-k, k] with k = sqrt(1 / in_features), for both weight and bias
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            # identity affine transform: bias 0, weight 1
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                # uniform in [-k, k] with k = sqrt(groups / (in_channels * kernel_size))
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, Wav2Vec2BertModel):
+            # `masked_spec_embed` is only present on some model instances
+            if hasattr(module, "masked_spec_embed"):
+                module.masked_spec_embed.data.uniform_()
+        elif isinstance(
+            module,
+            (Wav2Vec2BertForSequenceClassification, Wav2Vec2BertForAudioFrameClassification, Wav2Vec2BertForXVector),
+        ):
+            # `layer_weights` is only present on some head instances; start from a uniform average
+            # over `num_hidden_layers + 1` entries
+            if hasattr(module, "layer_weights"):
+                module.layer_weights.data.fill_(1.0 / (self.config.num_hidden_layers + 1))
+        elif isinstance(module, AMSoftmaxLoss):  # noqa: F821
+            module.weight.data.normal_()
+
+    # Ignore copy
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the output length of the convolutional layers
+
+        `add_adapter` defaults to `config.add_adapter`. When it is falsy, `input_lengths` is returned
+        unchanged; otherwise the Conv1d length formula is applied once per adapter layer using
+        `adapter_kernel_size`, `adapter_stride` and a padding of `adapter_kernel_size // 2`.
+        """
+
+        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+
+        def _conv_out_length(input_length, kernel_size, stride, padding):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length + 2 * padding - kernel_size, stride, rounding_mode="floor") + 1
+
+        if add_adapter:
+            padding = self.config.adapter_kernel_size // 2
+            for _ in range(self.config.num_adapter_layers):
+                input_lengths = _conv_out_length(
+                    input_lengths, self.config.adapter_kernel_size, self.config.adapter_stride, padding
+                )
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+    ):
+        """
+        Build a boolean mask of shape `(batch_size, feature_vector_length)` that is `True` for every
+        position up to and including index `output_length - 1` of each row, where `output_length` is
+        the row's non-padded length mapped through `_get_feat_extract_output_lengths`.
+        """
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        output_lengths = output_lengths.to(torch.long)
+
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations makes sure that all values before the output lengths idxs are attended to
+        # step 1: put a single 1 at the last valid index of every row
+        # NOTE(review): an output length of 0 gives index -1, i.e. the last column - confirm this is acceptable.
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        # step 2: reversed cumulative sum spreads that 1 to every earlier position
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask
+
+
+def _compute_mask_indices(
+    shape: tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    attention_mask: Optional[torch.LongTensor] = None,
+    min_masks: int = 0,
+) -> np.ndarray:
+    """
+    Draw random SpecAugment mask spans ([SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://huggingface.co/papers/1904.08779)) for a `(batch_size, sequence_length)` grid. The sampling is
+    done with NumPy on the host, so this is meant to run on CPU as part of preprocessing during training.
+
+    Args:
+        shape: `(batch_size, sequence_length)`, i.e. the batch size and the length of the axis to span.
+        mask_prob: Fraction of the axis (between 0 and 1) to mask. The number of independently drawn spans of
+                   length `mask_length` is `mask_prob*shape[1]/mask_length` (probabilistically rounded). Spans
+                   may overlap, so `mask_prob` is an upper bound on the fraction actually masked.
+        mask_length: length of each masked span
+        attention_mask: A (right-padded) attention mask which independently shortens the usable axis length
+                        of each batch row.
+        min_masks: minimum number of masked spans
+
+    Returns:
+        Boolean array of shape `(batch_size, sequence_length)`, `True` at masked positions.
+
+    Raises:
+        ValueError: if `mask_length < 1` or `mask_length > sequence_length`.
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+            f" and `sequence_length`: {sequence_length}`"
+        )
+
+    # a single random offset shared by all rows, used for probabilistic rounding
+    epsilon = np.random.rand(1).item()
+
+    def compute_num_masked_span(input_length):
+        """Number of spans to mask for a row with `input_length` usable positions."""
+        span_count = max(int(mask_prob * input_length / mask_length + epsilon), min_masks)
+
+        # the spans together must not be longer than the whole axis
+        if span_count * mask_length > sequence_length:
+            span_count = sequence_length // mask_length
+
+        # there are only `input_length - (mask_length - 1)` distinct start positions
+        available_starts = input_length - (mask_length - 1)
+        if available_starts < span_count:
+            span_count = max(available_starts, 0)
+
+        return span_count
+
+    # usable length of every batch row
+    if attention_mask is None:
+        input_lengths = [sequence_length for _ in range(batch_size)]
+    else:
+        input_lengths = attention_mask.detach().sum(-1).tolist()
+
+    # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+
+    max_num_masked_span = compute_num_masked_span(sequence_length)
+    if max_num_masked_span == 0:
+        return spec_aug_mask
+
+    per_row_starts = []
+    for input_length in input_lengths:
+        num_masked_span = compute_num_masked_span(input_length)
+
+        # sample distinct span start positions for this row
+        starts = np.random.choice(np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False)
+
+        # Rows with fewer spans than `max_num_masked_span` are padded with a dummy start so that all rows
+        # have equal width. Re-using the first sampled start just masks that span twice; with no sampled
+        # start (only possible when the row is shorter than `sequence_length`) the last position, which
+        # must then be padding, is used instead.
+        dummy_mask_idx = starts[0] if len(starts) != 0 else sequence_length - 1
+
+        filler = np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx
+        per_row_starts.append(np.concatenate([starts, filler]))
+
+    span_starts = np.array(per_row_starts)
+
+    # turn every start index into `mask_length` consecutive indices, then flatten the spans per row
+    span_offsets = np.arange(mask_length)[None, None, :]
+    spec_aug_mask_idxs = (span_starts[:, :, None] + span_offsets).reshape(
+        batch_size, max_num_masked_span * mask_length
+    )
+
+    # ensure that we cannot have indices larger than sequence_length
+    last_index = sequence_length - 1
+    if spec_aug_mask_idxs.max() > last_index:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > last_index] = last_index
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+# Alias: the base model below returns `Wav2Vec2BaseModelOutput` under a model-specific name.
+Wav2Vec2BertBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+@auto_docstring
+class Wav2Vec2BertModel(Wav2Vec2BertPreTrainedModel):
+    # Pipeline: feature projection -> optional SpecAugment masking -> encoder
+    # -> optional intermediate feed-forward -> optional adapter.
+    def __init__(self, config: Wav2Vec2BertConfig):
+        super().__init__(config)
+        self.config = config
+        self.feature_projection = Wav2Vec2BertFeatureProjection(config)
+
+        # model only needs masking vector if mask prob is > 0.0
+        uses_spec_augment = config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0
+        if uses_spec_augment:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        self.encoder = Wav2Vec2BertEncoder(config)
+
+        self.adapter = Wav2Vec2BertAdapter(config) if config.add_adapter else None
+
+        self.intermediate_ffn = (
+            Wav2Vec2BertFeedForward(config, act_fn="relu") if config.use_intermediate_ffn_before_adapter else None
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _mask_hidden_states(
+        self,
+        hidden_states: torch.FloatTensor,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Apply [SpecAugment](https://huggingface.co/papers/1904.08779) masking to `hidden_states` in place,
+        along the time axis and/or the feature axis, and return the same tensor.
+
+        Time masking uses `mask_time_indices` when given; otherwise spans are sampled, but only in training
+        mode with `config.mask_time_prob > 0`. Feature masking is only sampled in training mode with
+        `config.mask_feature_prob > 0`.
+        """
+
+        # `config.apply_spec_augment` can set masking to False
+        if not getattr(self.config, "apply_spec_augment", True):
+            return hidden_states
+
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # --- time axis: sample a mask only when the caller did not supply one ---
+        if mask_time_indices is None and self.config.mask_time_prob > 0 and self.training:
+            sampled_time_mask = _compute_mask_indices(
+                (batch_size, sequence_length),
+                mask_prob=self.config.mask_time_prob,
+                mask_length=self.config.mask_time_length,
+                attention_mask=attention_mask,
+                min_masks=self.config.mask_time_min_masks,
+            )
+            mask_time_indices = torch.tensor(sampled_time_mask, device=hidden_states.device, dtype=torch.bool)
+
+        if mask_time_indices is not None:
+            # masked frames are overwritten with the learned mask embedding
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+
+        # --- feature axis: masked channels are zeroed for every time step ---
+        if self.config.mask_feature_prob > 0 and self.training:
+            sampled_feature_mask = _compute_mask_indices(
+                (batch_size, hidden_size),
+                mask_prob=self.config.mask_feature_prob,
+                mask_length=self.config.mask_feature_length,
+                min_masks=self.config.mask_feature_min_masks,
+            )
+            feature_mask = torch.tensor(sampled_feature_mask, device=hidden_states.device, dtype=torch.bool)
+            hidden_states[feature_mask[:, None].expand(-1, sequence_length, -1)] = 0
+
+        return hidden_states
+
+    @auto_docstring
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, Wav2Vec2BertBaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        """
+        # unset flags fall back to the config defaults
+        if output_attentions is None:
+            output_attentions = self.config.output_attentions
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+
+        hidden_states, extract_features = self.feature_projection(input_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = encoder_outputs[0]
+
+        if self.intermediate_ffn:
+            # half-weighted residual around the intermediate feed-forward
+            hidden_states = hidden_states + 0.5 * self.intermediate_ffn(hidden_states)
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
+
+        if return_dict:
+            return Wav2Vec2BertBaseModelOutput(
+                last_hidden_state=hidden_states,
+                extract_features=extract_features,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+        # tuple layout: (last_hidden_state, extract_features, *remaining encoder outputs)
+        return (hidden_states, extract_features) + encoder_outputs[1:]
+
+
+# Index into the tuple returned by `Wav2Vec2BertModel.forward(..., return_dict=False)` at which the encoder's
+# extra outputs begin: positions 0 and 1 hold `last_hidden_state` and `extract_features`.
+_HIDDEN_STATES_START_POSITION = 2
+
+
+@auto_docstring(
+    custom_intro="""
+    Wav2Vec2Bert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
+    """
+)
+class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        r"""
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2BertForCTC`] with adapters. Uses 'eng' by
+            default.
+        """
+        super().__init__(config)
+
+        self.wav2vec2_bert = Wav2Vec2BertModel(config)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        self.target_lang = target_lang
+
+        if config.vocab_size is None:
+            raise ValueError(
+                f"You are trying to instantiate {self.__class__} with a configuration that "
+                "does not define the vocabulary size of the language model head. Please "
+                "instantiate the model as follows: `Wav2Vec2BertForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+                "or define `vocab_size` of your model's configuration."
+            )
+        # the head consumes the adapter's output width when an adapter is configured
+        output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        # Valid label ids are strictly below `vocab_size`; the message now states the same bound the check enforces.
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be < vocab_size: {self.config.vocab_size}")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            # (without a mask every frame of every sample counts as valid)
+            attention_mask = (
+                attention_mask
+                if attention_mask is not None
+                else torch.ones(input_features.shape[:2], device=input_features.device, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum([-1])).to(torch.long)
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            # transpose(0, 1): ctc_loss expects time-major log-probs
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            # tuple layout: ([loss,] logits, *encoder extras)
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Wav2Vec2Bert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
+    SUPERB Keyword Spotting.
+    """
+)
+class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        adapter_requested = hasattr(config, "add_adapter") and config.add_adapter
+        if adapter_requested:
+            raise ValueError(
+                "Sequence classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
+            )
+        self.wav2vec2_bert = Wav2Vec2BertModel(config)
+        # one weight per transformer layer plus one for the input embeddings
+        num_layers = config.num_hidden_layers + 1
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_base_model(self):
+        """
+        Turn off gradient computation for every parameter of the base model, so that training only updates
+        the classification head.
+        """
+        for base_param in self.wav2vec2_bert.parameters():
+            base_param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. A classification loss (Cross-Entropy) is computed.
+        """
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+        # the weighted layer sum needs every layer's hidden states
+        if self.config.use_weighted_layer_sum:
+            output_hidden_states = True
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not self.config.use_weighted_layer_sum:
+            hidden_states = outputs[0]
+        else:
+            # softmax-normalised weighted sum over the stacked per-layer hidden states
+            stacked_layers = torch.stack(outputs[_HIDDEN_STATES_START_POSITION], dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (stacked_layers * norm_weights.view(-1, 1, 1)).sum(dim=1)
+
+        hidden_states = self.projector(hidden_states)
+        if attention_mask is not None:
+            # mean over the non-padded frames only: zero the padded frames, then divide by the valid count
+            frame_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+            frame_mask_per_feature = frame_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~frame_mask_per_feature] = 0.0
+            pooled_output = hidden_states.sum(dim=1) / frame_mask.sum(dim=1).view(-1, 1)
+        else:
+            pooled_output = hidden_states.mean(dim=1)
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss = CrossEntropyLoss()(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+        if return_dict:
+            return SequenceClassifierOutput(
+                loss=loss,
+                logits=logits,
+                hidden_states=outputs.hidden_states,
+                attentions=outputs.attentions,
+            )
+
+        # tuple layout: ([loss,] logits, *encoder extras)
+        output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+        return output if loss is None else ((loss,) + output)
+
+
+@auto_docstring
+class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2BertPreTrainedModel):
+    # Per-frame classification head: a single linear layer applied to every frame's hidden state.
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Audio frame classification does not support the use of Wav2Vec2Bert adapters (config.add_adapter=True)"
+            )
+        self.wav2vec2_bert = Wav2Vec2BertModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.num_labels = config.num_labels
+
+        # NOTE(review): the sibling heads in this file call `self.post_init()` here - confirm whether
+        # `init_weights()` is intentional.
+        self.init_weights()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2_bert.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.Tensor` viewable as `(-1, config.num_labels)`, *optional*):
+            Per-frame target scores, e.g. one-hot vectors of shape `(batch_size, sequence_length,
+            config.num_labels)`. The target class of each frame is the argmax over the last dimension and a
+            classification loss (Cross-Entropy) is computed against the frame logits.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # the weighted layer sum needs every layer's hidden states
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            # softmax-normalised weighted sum over the stacked per-layer hidden states
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # labels carry one score per class; the class index is recovered with argmax
+            target_classes = torch.argmax(labels.view(-1, self.num_labels), dim=1)
+            loss = loss_fct(logits.view(-1, self.num_labels), target_classes)
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            # Previously the computed loss was dropped on this path; prepend it like the other heads in
+            # this file do, so tuple and dict outputs carry the same information.
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class AMSoftmaxLoss(nn.Module):
+    """
+    Additive-margin softmax loss.
+
+    Inputs and the learned class weights are L2-normalized, so their product is a cosine similarity. `margin`
+    is subtracted from the similarity of the target class only, the result is multiplied by `scale`, and
+    cross-entropy is computed against the target indices.
+    """
+
+    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
+        super().__init__()
+        self.scale = scale
+        self.margin = margin
+        self.num_labels = num_labels
+        # one column of weights per class
+        self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
+        self.loss = nn.CrossEntropyLoss()
+
+    def forward(self, hidden_states, labels):
+        targets = labels.flatten()
+        unit_weight = nn.functional.normalize(self.weight, dim=0)
+        unit_features = nn.functional.normalize(hidden_states, dim=1)
+        cosine = torch.mm(unit_features, unit_weight)
+
+        # the margin is applied only at the position of each sample's target class
+        is_target = nn.functional.one_hot(targets, self.num_labels).bool()
+        margin_cosine = torch.where(is_target, cosine - self.margin, cosine)
+        return self.loss(self.scale * margin_cosine, targets)
+
+
+class TDNNLayer(nn.Module):
+    """
+    One time-delay layer: a dilated 1D convolution along dim 1 of the input followed by ReLU. The weights are
+    stored in an `nn.Linear` (`self.kernel`) and reshaped into a convolution filter at call time.
+    """
+
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        # The first layer has no predecessor, so its input width is its own `tdnn_dim` entry.
+        if layer_id > 0:
+            self.in_conv_dim = config.tdnn_dim[layer_id - 1]
+        else:
+            self.in_conv_dim = config.tdnn_dim[layer_id]
+        self.out_conv_dim = config.tdnn_dim[layer_id]
+        self.kernel_size = config.tdnn_kernel[layer_id]
+        self.dilation = config.tdnn_dilation[layer_id]
+
+        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
+        self.activation = nn.ReLU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if is_peft_available():
+            from peft.tuners.lora import LoraLayer
+
+            # The raw `weight` / `bias` of `self.kernel` are used below, so a LoRA wrapper would be bypassed.
+            if isinstance(self.kernel, LoraLayer):
+                warnings.warn(
+                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
+                    "You should exclude TDNNLayer from LoRA's target modules.",
+                )
+
+        # The parameters stay in an nn.Linear for backward compatibility; the computation itself is a conv1d.
+        conv_weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
+        conv_input = hidden_states.transpose(1, 2)
+        conv_output = nn.functional.conv1d(conv_input, conv_weight, self.kernel.bias, dilation=self.dilation)
+        return self.activation(conv_output.transpose(1, 2))
+
+
+@auto_docstring(
+    custom_intro="""
+    Wav2Vec2Bert Model with an XVector feature extraction head on top for tasks like Speaker Verification.
+    """
+)
+class Wav2Vec2BertForXVector(Wav2Vec2BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.wav2vec2_bert = Wav2Vec2BertModel(config)
+        # one mixing weight per transformer layer, plus one for the input embeddings
+        layer_count = config.num_hidden_layers + 1
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(layer_count) / layer_count)
+        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
+
+        self.tdnn = nn.ModuleList([TDNNLayer(config, layer_id) for layer_id in range(len(config.tdnn_dim))])
+
+        # mean and std statistics are concatenated, hence twice the last TDNN width
+        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
+        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
+
+        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
+
+        self.init_weights()
+
+    def freeze_base_model(self):
+        """
+        Freeze the base model: turn off gradient tracking for every parameter of `self.wav2vec2_bert` so that
+        training leaves those weights untouched. Parameters outside the base model are not modified.
+        """
+        self.wav2vec2_bert.requires_grad_(False)
+
+    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the sequence length left after all TDNN layers, using one kernel size per entry of
+        `config.tdnn_kernel` and a stride of 1.
+        """
+        tdnn_stride = 1
+        for kernel_size in self.config.tdnn_kernel:
+            # 1D convolution output length, see
+            # https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            input_lengths = (input_lengths - kernel_size) // tdnn_stride + 1
+
+        return input_lengths
+
+    @auto_docstring
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, XVectorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Class indices in `[0, ..., config.num_labels - 1]`, one per utterance. When given, they are passed
+            together with the logits to the AM-Softmax objective (`self.objective`) to compute the loss.
+        """
+
+        if return_dict is None:
+            return_dict = self.config.use_return_dict
+        # The weighted layer sum needs every layer's hidden states.
+        if self.config.use_weighted_layer_sum:
+            output_hidden_states = True
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            stacked_layers = torch.stack(outputs[_HIDDEN_STATES_START_POSITION], dim=1)
+            layer_mix = nn.functional.softmax(self.layer_weights, dim=-1).view(-1, 1, 1)
+            hidden_states = (stacked_layers * layer_mix).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+        for tdnn_layer in self.tdnn:
+            hidden_states = tdnn_layer(hidden_states)
+
+        # Statistic pooling: mean and standard deviation over dim 1, restricted to the valid frames of each
+        # sequence when an attention mask is available.
+        if attention_mask is None:
+            mean_features = hidden_states.mean(dim=1)
+            std_features = hidden_states.std(dim=1)
+        else:
+            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
+            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
+            valid_frames = [hidden_states[i, :length] for i, length in enumerate(tdnn_output_lengths)]
+            mean_features = torch.stack([frames.mean(dim=0) for frames in valid_frames])
+            std_features = torch.stack([frames.std(dim=0) for frames in valid_frames])
+        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
+
+        output_embeddings = self.feature_extractor(statistic_pooling)
+        logits = self.classifier(output_embeddings)
+
+        loss = self.objective(logits, labels) if labels is not None else None
+
+        if not return_dict:
+            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return output if loss is None else (loss,) + output
+
+        return XVectorOutput(
+            loss=loss,
+            logits=logits,
+            embeddings=output_embeddings,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# Names exported by `from <module> import *`: the base model, its pretrained base class and the task heads.
+__all__ = [
+    "Wav2Vec2BertForAudioFrameClassification",
+    "Wav2Vec2BertForCTC",
+    "Wav2Vec2BertForSequenceClassification",
+    "Wav2Vec2BertForXVector",
+    "Wav2Vec2BertModel",
+    "Wav2Vec2BertPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f70da7cb840ebd4c7db0171c80460b58ca4108
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py
@@ -0,0 +1,1068 @@
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from ..wav2vec2.modeling_wav2vec2 import Wav2Vec2FeedForward, Wav2Vec2ForSequenceClassification, Wav2Vec2Model
+from ..wav2vec2_conformer.modeling_wav2vec2_conformer import (
+    Wav2Vec2ConformerForAudioFrameClassification,
+    Wav2Vec2ConformerForCTC,
+    Wav2Vec2ConformerForXVector,
+    Wav2Vec2ConformerRelPositionalEmbedding,
+    Wav2Vec2ConformerRotaryPositionalEmbedding,
+    Wav2Vec2ConformerSelfAttention,
+)
+from .configuration_wav2vec2_bert import Wav2Vec2BertConfig
+
+
+# Module-level logger named after this module.
+logger = logging.get_logger(__name__)
+
+
+# Index into the base model's output tuple - presumably where the per-layer hidden states start for the task
+# heads that mix or forward them; confirm against the heads defined further down this file.
+_HIDDEN_STATES_START_POSITION = 2
+
+
+# Copied from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2._compute_new_attention_mask
+def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
+    """
+    Builds a `(batch, seq_len)` mask holding 1 at every position strictly before the matching entry of
+    `seq_lens` and 0 from that position on.
+    Args:
+        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
+            The sequences to mask, where `*` is any number of sequence-specific dimensions including none. Only
+            the first two dimensions, the dtype and the device of this tensor are used.
+        seq_lens (`torch.Tensor` of shape `(batch)`):
+            Each element represents the length of the sequence at the same index in `hidden_states`
+    Returns:
+        `torch.FloatTensor`: The attention mask of shape `(batch, seq_len)`, with the dtype of `hidden_states`
+    """
+    batch_size, mask_seq_len = hidden_states.shape[:2]
+
+    positions = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
+    lengths = seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
+    is_padding = positions >= lengths
+
+    return hidden_states.new_ones((batch_size, mask_seq_len)).masked_fill(is_padding, 0)
+
+
+class Wav2Vec2BertRotaryPositionalEmbedding(Wav2Vec2ConformerRotaryPositionalEmbedding):
+    def __init__(self, config):
+        nn.Module.__init__(self)
+        head_dim = config.hidden_size // config.num_attention_heads
+
+        # one inverse frequency for every second index of the head dimension
+        exponents = torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim
+        inv_freq = 1.0 / (config.rotary_embedding_base**exponents)
+        # Ignore copy
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # cache slots, empty until filled elsewhere
+        self.cached_sequence_length = None
+        self.cached_rotary_positional_embedding = None
+
+
+class Wav2Vec2BertRelPositionalEmbedding(Wav2Vec2ConformerRelPositionalEmbedding):
+    # No overrides: everything is inherited unchanged from `Wav2Vec2ConformerRelPositionalEmbedding`.
+    pass
+
+
+class Wav2Vec2BertFeatureProjection(nn.Module):
+    """
+    Layer-normalizes the input features, projects them linearly to `config.hidden_size` and applies dropout.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        input_dim = config.feature_projection_input_dim
+        self.layer_norm = nn.LayerNorm(input_dim, eps=config.layer_norm_eps)
+        self.projection = nn.Linear(input_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # The normalized, not yet projected features are returned alongside the projected ones.
+        norm_hidden_states = self.layer_norm(hidden_states)
+        projected = self.dropout(self.projection(norm_hidden_states))
+        return projected, norm_hidden_states
+
+
+class Wav2Vec2BertFeedForward(Wav2Vec2FeedForward):
+    def __init__(self, config, act_fn=None, hidden_size=None):
+        nn.Module.__init__(self)
+        # Fall back to the config values when no explicit activation / width is given.
+        if act_fn is None:
+            act_fn = config.hidden_act
+        if hidden_size is None:
+            hidden_size = config.hidden_size
+        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+
+        self.intermediate_dense = nn.Linear(hidden_size, config.intermediate_size)
+        # A string names an entry of ACT2FN; anything else is used as the activation directly.
+        if isinstance(act_fn, str):
+            self.intermediate_act_fn = ACT2FN[act_fn]
+        else:
+            self.intermediate_act_fn = act_fn
+
+        self.output_dense = nn.Linear(config.intermediate_size, hidden_size)
+        self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+
+class Wav2Vec2BertConvolutionModule(nn.Module):
+    """Convolution sub-block of the conformer layer: pointwise conv + GLU, depthwise conv, pointwise conv."""
+
+    def __init__(self, config):
+        super().__init__()
+        hidden_size = config.hidden_size
+        if config.conv_depthwise_kernel_size % 2 == 0:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+        # doubles the channel count; the GLU below halves it again
+        self.pointwise_conv1 = nn.Conv1d(hidden_size, 2 * hidden_size, kernel_size=1, stride=1, padding=0, bias=False)
+        self.glu = nn.GLU(dim=1)
+        # one filter per channel (groups == channels); padding is added manually in `forward`
+        self.depthwise_conv = nn.Conv1d(
+            hidden_size,
+            hidden_size,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding=0,
+            groups=hidden_size,
+            bias=False,
+        )
+
+        self.depthwise_layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+        self.activation = ACT2FN[config.hidden_act]
+        self.pointwise_conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=1, stride=1, padding=0, bias=False)
+        self.dropout = nn.Dropout(config.conformer_conv_dropout)
+
+    def forward(self, hidden_states, attention_mask=None):
+        hidden_states = self.layer_norm(hidden_states)
+
+        # Zero out padded positions so they cannot leak into neighbours through the depthwise convolution.
+        if attention_mask is not None:
+            padded_positions = ~attention_mask.bool().unsqueeze(-1)
+            hidden_states = hidden_states.masked_fill(padded_positions, 0.0)
+
+        # swap dims 1 and 2 so that the channel dimension comes right after the batch dimension
+        features = hidden_states.transpose(1, 2)
+
+        # pointwise conv to twice the channels, then GLU over the channel dimension
+        features = self.glu(self.pointwise_conv1(features))
+
+        # All padding goes on the left (kernel_size - 1 positions), keeping the sequence length unchanged.
+        left_padding = self.depthwise_conv.kernel_size[0] - 1
+        features = torch.nn.functional.pad(features, (left_padding, 0))
+        features = self.depthwise_conv(features)
+
+        # layer norm acts on the last dimension, so swap dims around it
+        features = self.depthwise_layer_norm(features.transpose(1, 2)).transpose(1, 2)
+        features = self.activation(features)
+
+        features = self.dropout(self.pointwise_conv2(features))
+        return features.transpose(1, 2)
+
+
+class Wav2Vec2BertSelfAttention(Wav2Vec2ConformerSelfAttention, nn.Module):
+    """Multi-head self-attention used by the encoder layers and, with `is_adapter_attention=True`, by the
+    adapter layers. Depending on `config.position_embeddings_type` the scores can be enhanced with rotary,
+    "relative" or "relative_key" position information; adapter attention uses none of them.
+    """
+
+    def __init__(self, config, is_adapter_attention=False):
+        nn.Module.__init__(self)
+        # Adapter attention runs at the adapter's output width instead of the encoder's hidden width.
+        hidden_size = config.hidden_size if not is_adapter_attention else config.output_hidden_size
+
+        self.head_size = hidden_size // config.num_attention_heads
+        self.num_heads = config.num_attention_heads
+        # Position embeddings are disabled entirely for adapter attention.
+        self.position_embeddings_type = config.position_embeddings_type if not is_adapter_attention else None
+
+        self.linear_q = nn.Linear(hidden_size, hidden_size)
+        self.linear_k = nn.Linear(hidden_size, hidden_size)
+        self.linear_v = nn.Linear(hidden_size, hidden_size)
+        self.linear_out = nn.Linear(hidden_size, hidden_size)
+
+        self.dropout = nn.Dropout(p=config.attention_dropout)
+
+        if self.position_embeddings_type == "relative":
+            # linear transformation for positional encoding
+            self.linear_pos = nn.Linear(hidden_size, hidden_size, bias=False)
+            # these two learnable bias are used in matrix c and matrix d
+            # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+
+        if self.position_embeddings_type == "relative_key":
+            # One embedding per clamped distance in [-left_max, right_max].
+            self.left_max_position_embeddings = config.left_max_position_embeddings
+            self.right_max_position_embeddings = config.right_max_position_embeddings
+            num_positions = self.left_max_position_embeddings + self.right_max_position_embeddings + 1
+            self.distance_embedding = nn.Embedding(num_positions, self.head_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Apply self-attention to a 3-dimensional `(batch, sequence, hidden)` input.
+
+        `attention_mask`, when given, is added to the raw scores before the softmax.
+        `relative_position_embeddings` is required for the "rotary" and "relative" position types.
+        `output_attentions` is accepted but not consulted in this method.
+
+        Returns the attended hidden states and the attention probabilities (after dropout).
+
+        Raises `ValueError` when `relative_position_embeddings` is missing for "rotary" or "relative".
+        """
+        # self-attention mechanism
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        if self.position_embeddings_type == "rotary":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
+                )
+            # Rotary embeddings only touch the tensor feeding queries and keys, never the values.
+            query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        if self.position_embeddings_type == "relative":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type =="
+                    " 'relative'"
+                )
+            # apply relative_position_embeddings to qk scores
+            # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860
+            scores = self._apply_relative_embeddings(
+                query=query, key=key, relative_position_embeddings=relative_position_embeddings
+            )
+        else:
+            # Plain scaled dot-product scores.
+            scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        if self.position_embeddings_type == "relative_key":
+            query_length, key_length = query.shape[2], key.shape[2]
+
+            # Signed distance (key position - query position) for every query/key pair, clamped to the
+            # supported window and shifted to a non-negative embedding index.
+            position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_r - position_ids_l
+            distance = torch.clamp(distance, -self.left_max_position_embeddings, self.right_max_position_embeddings)
+
+            positional_embedding = self.distance_embedding(distance + self.left_max_position_embeddings)
+            positional_embedding = positional_embedding.to(dtype=query.dtype)  # fp16 compatibility
+
+            # Query-dependent positional term, scaled like the content scores and added to them.
+            relative_position_attn_weights = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+            scores = scores + (relative_position_attn_weights / math.sqrt(self.head_size))
+
+        # apply attention_mask if necessary
+        if attention_mask is not None:
+            scores = scores + attention_mask
+
+        # => (batch, head, time1, time2)
+        probs = torch.softmax(scores, dim=-1)
+        probs = self.dropout(probs)
+
+        # => (batch, head, time1, d_k)
+        hidden_states = torch.matmul(probs, value)
+
+        # => (batch, time1, hidden_size)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        hidden_states = self.linear_out(hidden_states)
+
+        return hidden_states, probs
+
+
+class Wav2Vec2BertEncoderLayer(GradientCheckpointingLayer):
+    """
+    Conformer block (https://huggingface.co/papers/2005.08100): half-step feed-forward, self-attention,
+    convolution module and a second half-step feed-forward, each with a residual connection, followed by a
+    final layer norm.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        width = config.hidden_size
+        norm_eps = config.layer_norm_eps
+
+        # Feed-forward 1
+        self.ffn1_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+        self.ffn1 = Wav2Vec2BertFeedForward(config)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+        self.self_attn_dropout = nn.Dropout(config.attention_dropout)
+        self.self_attn = Wav2Vec2BertSelfAttention(config)
+
+        # Conformer Convolution
+        self.conv_module = Wav2Vec2BertConvolutionModule(config)
+
+        # Feed-forward 2
+        self.ffn2_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+        self.ffn2 = Wav2Vec2BertFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(width, eps=norm_eps)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        conv_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # 1. First feed-forward, added to the residual with a weight of 0.5
+        ffn1_output = self.ffn1(self.ffn1_layer_norm(hidden_states))
+        hidden_states = ffn1_output * 0.5 + hidden_states
+
+        # 2. Self-attention on the normalized states, with dropout on its output
+        attn_output, attn_weights = self.self_attn(
+            hidden_states=self.self_attn_layer_norm(hidden_states),
+            attention_mask=attention_mask,
+            relative_position_embeddings=relative_position_embeddings,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(attn_output) + hidden_states
+
+        # 3. Convolution module; it receives its own mask (`conv_attention_mask`)
+        hidden_states = hidden_states + self.conv_module(hidden_states, attention_mask=conv_attention_mask)
+
+        # 4. Second feed-forward, again weighted by 0.5, then the final layer norm
+        ffn2_output = self.ffn2(self.ffn2_layer_norm(hidden_states))
+        hidden_states = self.final_layer_norm(ffn2_output * 0.5 + hidden_states)
+
+        return hidden_states, attn_weights
+
+
+class Wav2Vec2BertEncoder(nn.Module):
+    """
+    Stack of `config.num_hidden_layers` encoder layers with optional shared position embeddings and LayerDrop.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        # Only the "relative" and "rotary" types need an embedding module built here.
+        if config.position_embeddings_type == "relative":
+            self.embed_positions = Wav2Vec2BertRelPositionalEmbedding(config)
+        elif config.position_embeddings_type == "rotary":
+            self.embed_positions = Wav2Vec2BertRotaryPositionalEmbedding(config)
+        else:
+            self.embed_positions = None
+
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([Wav2Vec2BertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        """
+        Run all encoder layers over `hidden_states`.
+
+        Returns a `BaseModelOutput` when `return_dict` is true, otherwise a tuple of the non-`None` values among
+        (last hidden state, all hidden states, all attentions).
+        """
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        # The convolution modules get the original 2D mask; attention gets the expanded additive mask below.
+        conv_attention_mask = attention_mask
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+
+            # extend attention_mask
+            # 0 where attention is allowed, the dtype's most negative value where it is masked,
+            # expanded to (batch, 1, seq_len, seq_len)
+            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+            attention_mask = attention_mask.expand(
+                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+            )
+
+        hidden_states = self.dropout(hidden_states)
+
+        # Position embeddings are computed once and shared by every layer.
+        if self.embed_positions is not None:
+            relative_position_embeddings = self.embed_positions(hidden_states)
+        else:
+            relative_position_embeddings = None
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for i, layer in enumerate(self.layers):
+            # The states recorded for a layer are its inputs; the final output is appended after the loop.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            # A random number is drawn on every iteration, including in eval mode.
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                # NOTE(review): in that case the layer output still replaces `hidden_states` even when
+                # `skip_the_layer` is true; only the attention entry is discarded below.
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    relative_position_embeddings=relative_position_embeddings,
+                    output_attentions=output_attentions,
+                    conv_attention_mask=conv_attention_mask,
+                )
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class Wav2Vec2BertAdapter(nn.Module):
+    """
+    Stack of `config.num_adapter_layers` adapter layers, preceded by a linear projection and layer norm when
+    `config.output_hidden_size` differs from `config.hidden_size`.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        # feature dim might need to be down-projected
+        if config.output_hidden_size != config.hidden_size:
+            self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
+            self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size, eps=config.layer_norm_eps)
+        else:
+            self.proj = self.proj_layer_norm = None
+        self.layers = nn.ModuleList(Wav2Vec2BertAdapterLayer(config) for _ in range(config.num_adapter_layers))
+        self.layerdrop = config.layerdrop
+
+        self.kernel_size = config.adapter_kernel_size
+        self.stride = config.adapter_stride
+
+    def _compute_sub_sample_lengths_from_attention_mask(self, seq_lens):
+        """
+        Return the sequence lengths left after one adapter layer's strided convolution.
+
+        Args:
+            seq_lens: tensor of per-sequence lengths, or `None`.
+
+        Returns:
+            `None` when `seq_lens` is `None`, otherwise a floating-point tensor holding the floored lengths.
+        """
+        if seq_lens is None:
+            return seq_lens
+        # Must mirror the `padding=self.stride // 2` the adapter layer convolutions are built with; using
+        # `kernel_size // 2` here only matches when both expressions happen to be equal.
+        pad = self.stride // 2
+        seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
+        return seq_lens.floor()
+
+    def forward(self, hidden_states, attention_mask=None):
+        """
+        Optionally project `hidden_states`, then pass them through the adapter layers. During training a layer
+        is skipped when its random draw is not greater than `self.layerdrop`.
+        """
+        # down project hidden_states if necessary
+        if self.proj is not None and self.proj_layer_norm is not None:
+            hidden_states = self.proj(hidden_states)
+            hidden_states = self.proj_layer_norm(hidden_states)
+
+        sub_sampled_lengths = None
+        if attention_mask is not None:
+            # number of non-masked positions per sequence: total length minus the count of zeros in the mask
+            sub_sampled_lengths = (attention_mask.size(1) - (1 - attention_mask.int()).sum(1)).to(hidden_states.device)
+
+        for layer in self.layers:
+            layerdrop_prob = torch.rand([])
+            # The lengths are updated on every iteration, whether or not the layer is actually run.
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(sub_sampled_lengths)
+            if not self.training or (layerdrop_prob > self.layerdrop):
+                hidden_states = layer(
+                    hidden_states, attention_mask=attention_mask, sub_sampled_lengths=sub_sampled_lengths
+                )
+
+        return hidden_states
+
+
+class Wav2Vec2BertAdapterLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.output_hidden_size
+        dropout = config.conformer_conv_dropout
+        # Kernel size and stride are shared by the two pooling convolutions built below.
+        self.kernel_size = config.adapter_kernel_size
+        self.stride = config.adapter_stride
+
+        # Residual branch: LayerNorm, then a Conv1d producing 2 * embed_dim channels for the GLU.
+        self.residual_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.residual_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.activation = nn.GLU(dim=1)  # one GLU module, applied to both conv outputs in forward
+
+        # Self-attention branch: its own LayerNorm and a second conv with the same geometry.
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.self_attn_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.self_attn = Wav2Vec2BertSelfAttention(config, is_adapter_attention=True)
+        self.self_attn_dropout = nn.Dropout(dropout)
+
+        # Feed-forward
+        self.ffn_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.ffn = Wav2Vec2BertFeedForward(config, act_fn=config.adapter_act, hidden_size=embed_dim)
+    # forward() returns only the hidden states; the attention weights computed inside are discarded.
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        sub_sampled_lengths: Optional[torch.Tensor] = None,
+    ):
+        residual = self.residual_layer_norm(hidden_states)
+
+        # Apply pooling to the residual to match the sequence length of the
+        # multi-head attention output.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        residual = residual.transpose(1, 2)
+        residual = self.residual_conv(residual)
+        residual = self.activation(residual)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        residual = residual.transpose(1, 2)
+
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # Apply pooling before feeding to the multihead-attention layer.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.self_attn_conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        hidden_states = hidden_states.transpose(1, 2)
+        # The incoming mask only gates this branch; a new mask is computed from the pooled states and lengths.
+        if attention_mask is not None:
+            attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
+            attention_mask = _prepare_4d_attention_mask(
+                attention_mask,
+                hidden_states.dtype,
+            )
+
+        # The rest of the computation is identical to a vanilla Transformer
+        # encoder layer.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        residual = hidden_states
+        # Pre-norm feed-forward block with its own residual connection.
+        hidden_states = self.ffn_layer_norm(hidden_states)
+        hidden_states = self.ffn(hidden_states) + residual
+
+        return hidden_states
+
+
+@auto_docstring
+class Wav2Vec2BertPreTrainedModel(PreTrainedModel):
+    config: Wav2Vec2BertConfig
+    base_model_prefix = "wav2vec2_bert"
+    main_input_name = "input_features"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module`; only the first matching `isinstance` branch below is applied."""
+        if isinstance(module, Wav2Vec2BertSelfAttention):
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, Wav2Vec2BertFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)  # uniform bound: 1 / sqrt(in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+        elif isinstance(module, Wav2Vec2BertModel):
+            if hasattr(module, "masked_spec_embed"):
+                module.masked_spec_embed.data.uniform_()
+        elif isinstance(
+            module,
+            (Wav2Vec2BertForSequenceClassification, Wav2Vec2BertForAudioFrameClassification, Wav2Vec2BertForXVector),
+        ):
+            if hasattr(module, "layer_weights"):
+                module.layer_weights.data.fill_(1.0 / (self.config.num_hidden_layers + 1))  # equal weight per layer
+        elif isinstance(module, AMSoftmaxLoss):  # noqa: F821
+            module.weight.data.normal_()
+
+    # Ignore copy
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the sequence length after the adapter convolutions; lengths are returned unchanged without adapter
+        """
+        # An explicit `add_adapter` argument takes precedence over `config.add_adapter`.
+        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+
+        def _conv_out_length(input_length, kernel_size, stride, padding):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length + 2 * padding - kernel_size, stride, rounding_mode="floor") + 1
+
+        if add_adapter:
+            padding = self.config.adapter_kernel_size // 2  # NOTE(review): adapter convs pad with stride // 2 — confirm
+            for _ in range(self.config.num_adapter_layers):
+                input_lengths = _conv_out_length(
+                    input_lengths, self.config.adapter_kernel_size, self.config.adapter_stride, padding
+                )
+
+        return input_lengths
+    # Builds a boolean mask of shape (batch_size, feature_vector_length), True up to each sample's output length.
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+    ):
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+        # Convert the unpadded input lengths to the lengths after the (optional) adapter convolutions.
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        output_lengths = output_lengths.to(torch.long)
+
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations makes sure that all values before the output lengths idxs are attended to
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask
+
+
+Wav2Vec2BertBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+class Wav2Vec2BertModel(Wav2Vec2Model, Wav2Vec2BertPreTrainedModel):
+    def __init__(self, config: Wav2Vec2BertConfig):
+        Wav2Vec2BertPreTrainedModel.__init__(self, config)
+        self.config = config
+        self.feature_projection = Wav2Vec2BertFeatureProjection(config)
+
+        # model only needs masking vector if mask prob is > 0.0
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        self.encoder = Wav2Vec2BertEncoder(config)
+
+        self.adapter = Wav2Vec2BertAdapter(config) if config.add_adapter else None
+
+        self.intermediate_ffn = None
+        if config.use_intermediate_ffn_before_adapter:
+            self.intermediate_ffn = Wav2Vec2BertFeedForward(config, act_fn="relu")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for Wav2Vec2Bert")
+
+    def freeze_feature_encoder(self):
+        raise AttributeError("Not needed for Wav2Vec2Bert")
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, Wav2Vec2BertBaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states, extract_features = self.feature_projection(input_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.intermediate_ffn:
+            expanded_hidden_states = self.intermediate_ffn(hidden_states)
+            hidden_states = hidden_states + 0.5 * expanded_hidden_states
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
+
+        if not return_dict:
+            return (hidden_states, extract_features) + encoder_outputs[1:]
+
+        return Wav2Vec2BertBaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class Wav2Vec2BertForCTC(Wav2Vec2ConformerForCTC):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        r"""
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses 'eng' by
+            default.
+        """
+        super().__init__(config)
+
+    def freeze_feature_encoder(self):
+        raise AttributeError("Not needed for Wav2Vec2Bert")
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask
+                if attention_mask is not None
+                else torch.ones(input_features.shape[:2], device=input_features.device, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum([-1])).to(torch.long)
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+class Wav2Vec2BertForSequenceClassification(Wav2Vec2ForSequenceClassification):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def freeze_feature_extractor(self):  # deliberately unsupported on this model
+        raise AttributeError("Not needed for Wav2Vec2Bert")
+
+    def freeze_feature_encoder(self):  # deliberately unsupported on this model
+        raise AttributeError("Not needed for Wav2Vec2Bert")
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2_bert.parameters():
+            param.requires_grad = False
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. The loss computed by this method is always a Cross-Entropy loss over
+            `config.num_labels` classes.
+        """
+        # The weighted layer sum needs every hidden state, so that option forces `output_hidden_states`.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+        # Project, then mean-pool over dim 1; with a mask, only the unpadded frames contribute to the mean.
+        hidden_states = self.projector(hidden_states)
+        if attention_mask is None:
+            pooled_output = hidden_states.mean(dim=1)
+        else:
+            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+            expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_padding_mask] = 0.0  # in-place: zero the padded frames before summing
+            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+        # Tuple form: (loss?, logits, *outputs from _HIDDEN_STATES_START_POSITION on).
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2ConformerForAudioFrameClassification):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def freeze_feature_encoder(self):
+        raise AttributeError("Not needed for Wav2Vec2Bert")
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Wav2Vec2BertForXVector(Wav2Vec2ConformerForXVector):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def freeze_feature_encoder(self):  # deliberately unsupported on this model
+        raise AttributeError("Not needed for Wav2Vec2Bert")
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, XVectorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for the classification loss. When given, they are passed together with the logits to
+            `self.objective`, and the result is returned as `loss`. Which loss that is depends on the objective
+            module set up on the model.
+        """
+        # The weighted layer sum needs every hidden state, so that option forces `output_hidden_states`.
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2_bert(
+            input_features,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+
+        for tdnn_layer in self.tdnn:
+            hidden_states = tdnn_layer(hidden_states)
+
+        # Statistic pooling: per-sample mean and standard deviation over dim 1, concatenated on the last dim.
+        if attention_mask is None:
+            mean_features = hidden_states.mean(dim=1)
+            std_features = hidden_states.std(dim=1)
+        else:
+            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
+            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
+            mean_features = []
+            std_features = []
+            for i, length in enumerate(tdnn_output_lengths):  # pool each sample over its first `length` frames
+                mean_features.append(hidden_states[i, :length].mean(dim=0))
+                std_features.append(hidden_states[i, :length].std(dim=0))
+            mean_features = torch.stack(mean_features)
+            std_features = torch.stack(std_features)
+        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
+
+        output_embeddings = self.feature_extractor(statistic_pooling)
+        logits = self.classifier(output_embeddings)
+
+        loss = None
+        if labels is not None:
+            loss = self.objective(logits, labels)
+        # Tuple form: (loss?, logits, embeddings, *outputs from _HIDDEN_STATES_START_POSITION on).
+        if not return_dict:
+            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return XVectorOutput(
+            loss=loss,
+            logits=logits,
+            embeddings=output_embeddings,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "Wav2Vec2BertForAudioFrameClassification",
+    "Wav2Vec2BertForCTC",
+    "Wav2Vec2BertForSequenceClassification",
+    "Wav2Vec2BertForXVector",
+    "Wav2Vec2BertModel",
+    "Wav2Vec2BertPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..ead53edb101a749cb4e98f90ef16f65912edc93f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py
@@ -0,0 +1,155 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for Wav2Vec2-BERT
+"""
+
+import warnings
+from typing import Optional, Union
+
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import AudioInput, PreTokenizedInput, TextInput
+from ..seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor
+from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
+
+
+class Wav2Vec2BertProcessorKwargs(ProcessingKwargs, total=False):  # kwargs schema handed to `_merge_kwargs`
+    _defaults = {}  # no processor-specific default values
+
+
+class Wav2Vec2BertProcessor(ProcessorMixin):
+    r"""
+    Constructs a Wav2Vec2-BERT processor which wraps a Wav2Vec2-BERT feature extractor and a Wav2Vec2 CTC tokenizer into a single
+    processor.
+
+    [`Wav2Vec2Processor`] offers all the functionalities of [`SeamlessM4TFeatureExtractor`] and [`PreTrainedTokenizer`].
+    See the docstring of [`~Wav2Vec2Processor.__call__`] and [`~Wav2Vec2Processor.decode`] for more information.
+
+    Args:
+        feature_extractor (`SeamlessM4TFeatureExtractor`):
+            An instance of [`SeamlessM4TFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`PreTrainedTokenizer`]):
+            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+    """
+
+    feature_extractor_class = "SeamlessM4TFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        try:
+            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+        except OSError:
+            warnings.warn(
+                f"Loading a tokenizer inside {cls.__name__} from a config that does not"
+                " include a `tokenizer_class` attribute is deprecated and will be "
+                "removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`"
+                " attribute to either your `config.json` or `tokenizer_config.json` "
+                "file to suppress this warning: ",
+                FutureWarning,
+            )
+
+            feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+            return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
+
+    def __call__(
+        self,
+        audio: Optional[AudioInput] = None,
+        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
+        images=None,
+        videos=None,
+        **kwargs: Unpack[Wav2Vec2BertProcessorKwargs],
+    ):
+        """
+        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `audio`
+        and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audio` is not
+        `None` to pre-process the audio. To prepare the target sequences(s), this method forwards the `text` and `kwargs` arguments to
+        PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`] if `text` is not `None`. Please refer to the docstring of the above two methods for more information.
+
+        Args:
+            audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
+                of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
+                and T the sample length of the audio.
+
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+        Returns:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+            - **input_features** -- Audio input features to be fed to a model. Returned when `audio` is not `None`.
+            - **attention_mask** -- List of indices specifying which timestamps should be attended to by the model when `audio` is not `None`.
+            When only `text` is specified, returns the token attention mask.
+            - **labels** -- List of token ids to be fed to a model. Returned when both `text` and `audio` are not `None`.
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None` and `audio` is `None`.
+        """
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+        output_kwargs = self._merge_kwargs(
+            Wav2Vec2BertProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
+        if text is not None:
+            encodings = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def pad(self, input_features=None, labels=None, **kwargs):
+        """
+        If `input_features` is not `None`, this method forwards the `input_features` and `kwargs` arguments to SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.pad`] to pad the input features.
+        If `labels` is not `None`, this method forwards the `labels` and `kwargs` arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`] to pad the label(s).
+        Please refer to the docstring of the above two methods for more information.
+        """
+        if input_features is None and labels is None:
+            raise ValueError("You need to specify either an `input_features` or `labels` input to pad.")
+
+        if input_features is not None:
+            input_features = self.feature_extractor.pad(input_features, **kwargs)
+        if labels is not None:
+            labels = self.tokenizer.pad(labels, **kwargs)
+
+        if labels is None:
+            return input_features
+        elif input_features is None:
+            return labels
+        else:
+            input_features["labels"] = labels["input_ids"]
+            return input_features
+
+    @property
+    def model_input_names(self):
+        # The processor doesn't return text ids and the model seems to not need them
+        feature_extractor_input_names = self.feature_extractor.model_input_names
+        return feature_extractor_input_names + ["labels"]
+
+
+__all__ = ["Wav2Vec2BertProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc9424e769774b1f994546ac8f0fd016019b533d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/configuration_wav2vec2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/configuration_wav2vec2_conformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d0bf164f0fcdad3c10f6379475c46ba0352db8a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/configuration_wav2vec2_conformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68ae6290370c0302b66bd044468762c0c0682e74
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modeling_wav2vec2_conformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modular_wav2vec2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modular_wav2vec2_conformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab4e3167f08bc6a6b98a11b6c3d905eecc607627
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__pycache__/modular_wav2vec2_conformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_phoneme/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_phoneme/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb12e9bcc559dac287ba7216cf96c60f2a968adb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_phoneme/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_phoneme/__pycache__/tokenization_wav2vec2_phoneme.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_phoneme/__pycache__/tokenization_wav2vec2_phoneme.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37fa05d84d806cf32f8f4aefb4273576a1d43d0e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_phoneme/__pycache__/tokenization_wav2vec2_phoneme.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/modeling_wavlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/modeling_wavlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a34b5d61d71a99fec9b9182f3f21c9243eff52d8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/modeling_wavlm.py
@@ -0,0 +1,1706 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/wavlm/modular_wavlm.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_wavlm.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import math
+import warnings
+from typing import Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, is_peft_available, logging
+from .configuration_wavlm import WavLMConfig
+
+
+logger = logging.get_logger(__name__)  # module-level logger obtained from the library's logging helper
+
+
+class WavLMSamePadLayer(nn.Module):
+    """Trims one trailing position from the last axis when `num_conv_pos_embeddings` is even; otherwise a no-op."""
+    def __init__(self, num_conv_pos_embeddings):
+        super().__init__()
+        self.num_pad_remove = int(num_conv_pos_embeddings % 2 == 0)
+
+    def forward(self, hidden_states):
+        trim = self.num_pad_remove
+        return hidden_states[:, :, :-trim] if trim > 0 else hidden_states
+
+
+class WavLMPositionalConvEmbedding(nn.Module):  # weight-normalized grouped Conv1d, then trailing trim and activation
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=config.num_conv_pos_embeddings,
+            padding=config.num_conv_pos_embeddings // 2,  # with an even kernel this yields one extra output step
+            groups=config.num_conv_pos_embedding_groups,
+        )
+
+        weight_norm = nn.utils.weight_norm  # older API, kept as the fallback
+        if hasattr(nn.utils.parametrizations, "weight_norm"):  # use the parametrization-based API when it exists
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):  # wrap while weight is gathered
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
+            if hasattr(self.conv, "parametrizations"):  # parametrized modules expose the two weight-norm tensors here
+                weight_g = self.conv.parametrizations.weight.original0
+                weight_v = self.conv.parametrizations.weight.original1
+            else:  # the older API stores them as plain attributes
+                weight_g = self.conv.weight_g
+                weight_v = self.conv.weight_v
+            deepspeed.zero.register_external_parameter(self, weight_v)
+            deepspeed.zero.register_external_parameter(self, weight_g)
+        else:
+            self.conv = weight_norm(self.conv, name="weight", dim=2)
+
+        self.padding = WavLMSamePadLayer(config.num_conv_pos_embeddings)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose(1, 2)  # swap dims 1 and 2 so the conv runs over the former dim 1
+
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.padding(hidden_states)  # removes the extra trailing step for even kernel sizes
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)  # swap back to the layout of the input
+        return hidden_states
+
+
+class WavLMFeatureProjection(nn.Module):
+    """LayerNorm -> Linear (`conv_dim[-1]` to `hidden_size`) -> Dropout; also returns the normalized input."""
+    def __init__(self, config):
+        super().__init__()
+        in_features = config.conv_dim[-1]
+        self.layer_norm = nn.LayerNorm(in_features, eps=config.layer_norm_eps)
+        self.projection = nn.Linear(in_features, config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # the normalized (not yet projected) features are returned as well, for use in quantization
+        normed = self.layer_norm(hidden_states)
+        return self.dropout(self.projection(normed)), normed
+
+
+class WavLMAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper, with a gated relative position bias"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        num_buckets: int = 320,
+        max_distance: int = 800,
+        has_relative_position_bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5  # not read by any other method of this class
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+
+        self.num_buckets = num_buckets
+        self.max_distance = max_distance
+
+        self.gru_rel_pos_const = nn.Parameter(torch.ones(1, self.num_heads, 1, 1))
+        self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8)  # 8 outputs = 2 gates x 4 values, summed in forward()
+
+        if has_relative_position_bias:  # without it, compute_bias() cannot run and position_bias must be passed in
+            self.rel_attn_embed = nn.Embedding(self.num_buckets, self.num_heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_bias: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        index=0,  # not used in this method
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Attention layer with relative attention; returns (attn_output, attn_weights, ungated position_bias)"""
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # first pass of attention layer creates position bias, tiled to (bsz * num_heads, tgt_len, tgt_len)
+        if position_bias is None:
+            position_bias = self.compute_bias(tgt_len, tgt_len)
+            position_bias = (
+                position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, tgt_len)
+            )
+
+        # Compute relative position bias:
+        # 1) reshape hidden_states to (bsz, num_heads, tgt_len, head_dim)
+        gated_hidden_states = hidden_states.view(hidden_states.shape[:-1] + (self.num_heads, -1))
+        gated_hidden_states = gated_hidden_states.permute(0, 2, 1, 3)
+
+        # 2) project hidden states to 8 values per head and position, then sum them in two groups of four
+        relative_position_proj = self.gru_rel_pos_linear(gated_hidden_states)
+        relative_position_proj = relative_position_proj.view(gated_hidden_states.shape[:-1] + (2, 4)).sum(-1)
+
+        # 3) compute gate for position bias from projected hidden states
+        gate_a, gate_b = torch.sigmoid(relative_position_proj).chunk(2, dim=-1)
+        gate_output = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0
+
+        # 4) apply gate to position bias to compute gated position_bias (one gate value per head and query position)
+        gated_position_bias = gate_output.view(bsz * self.num_heads, -1, 1) * position_bias
+        gated_position_bias = gated_position_bias.view((-1, tgt_len, tgt_len))
+
+        attn_output, attn_weights = self.torch_multi_head_self_attention(
+            hidden_states, attention_mask, gated_position_bias, output_attentions
+        )
+
+        return attn_output, attn_weights, position_bias  # the ungated bias is what gets handed back to the caller
+
+    def torch_multi_head_self_attention(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: Union[torch.LongTensor, torch.BoolTensor],
+        gated_position_bias: torch.FloatTensor,
+        output_attentions: bool,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+        """simple wrapper around torch's multi_head_attention_forward function"""
+        # self-attention assumes q = k = v; dims 0 and 1 are swapped before the call and swapped back afterwards
+        query = key = value = hidden_states.transpose(0, 1)
+        key_padding_mask = attention_mask.ne(1) if attention_mask is not None else None  # True where mask != 1
+
+        # disable bias and add_zero_attn
+        bias_k = bias_v = None
+        add_zero_attn = False
+
+        # PyTorch 1.3.0 has F.multi_head_attention_forward defined
+        # so no problem with backwards compatibility
+        attn_output, attn_weights = F.multi_head_attention_forward(
+            query,
+            key,
+            value,
+            self.embed_dim,
+            self.num_heads,
+            torch.empty([0]),  # in_proj_weight placeholder; separate q/k/v weights are passed by keyword below
+            torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),  # in_proj_bias, in q, k, v order
+            bias_k,
+            bias_v,
+            add_zero_attn,
+            self.dropout,
+            self.out_proj.weight,
+            self.out_proj.bias,
+            self.training,
+            key_padding_mask,
+            output_attentions,  # need_weights
+            gated_position_bias,  # attn_mask
+            use_separate_proj_weight=True,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+        )
+
+        # [Seq_Len, Batch Size, ...] -> [Batch Size, Seq_Len, ...]
+        attn_output = attn_output.transpose(0, 1)
+
+        if attn_weights is not None:
+            # IMPORTANT: Attention weights are averaged weights
+            # here which should not be the case. This is an open issue
+            # on PyTorch: https://github.com/pytorch/pytorch/issues/32590
+            attn_weights = attn_weights[:, None].broadcast_to(  # same averaged weights repeated on a new heads axis
+                attn_weights.shape[:1] + (self.num_heads,) + attn_weights.shape[1:]
+            )
+
+        return attn_output, attn_weights
+
+    def compute_bias(self, query_length: int, key_length: int) -> torch.FloatTensor:
+        context_position = torch.arange(query_length, dtype=torch.long)[:, None]  # column of query indices
+        memory_position = torch.arange(key_length, dtype=torch.long)[None, :]  # row of key indices
+        relative_position = memory_position - context_position  # entry [i, j] = j - i
+        relative_position_bucket = self._relative_positions_bucket(relative_position)
+        relative_position_bucket = relative_position_bucket.to(self.rel_attn_embed.weight.device)
+        values = self.rel_attn_embed(relative_position_bucket)
+        values = values.permute([2, 0, 1])  # -> (num_heads, query_length, key_length)
+        return values
+
+    def _relative_positions_bucket(self, relative_positions: torch.FloatTensor) -> torch.FloatTensor:
+        num_buckets = self.num_buckets // 2  # half of the buckets for each sign of the offset
+
+        relative_buckets = (relative_positions > 0).to(torch.long) * num_buckets  # positive offsets: upper half
+        relative_positions = torch.abs(relative_positions)
+
+        max_exact = num_buckets // 2  # distances below this get one bucket each
+        is_small = relative_positions < max_exact
+
+        relative_positions_if_large = torch.log(relative_positions.float() / max_exact)  # log-spaced beyond max_exact
+        relative_positions_if_large = relative_positions_if_large / math.log(self.max_distance / max_exact)
+        relative_positions_if_large = relative_positions_if_large * (num_buckets - max_exact)
+        relative_position_if_large = (max_exact + relative_positions_if_large).to(torch.long)
+        relative_position_if_large = torch.min(  # cap at the last bucket of the half
+            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
+        )
+
+        relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)  # small: exact value
+        return relative_buckets
+
+
+class WavLMFeedForward(nn.Module):
+    """
+    Two-layer feed-forward block: `hidden_size` -> `intermediate_size` -> `hidden_size`, with dropout applied
+    after the activation and again after the output projection.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+
+        # `hidden_act` is either a key into ACT2FN or is used directly as the activation
+        activation = config.hidden_act
+        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
+
+        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+    def forward(self, hidden_states):
+        expanded = self.intermediate_act_fn(self.intermediate_dense(hidden_states))
+        contracted = self.output_dense(self.intermediate_dropout(expanded))
+        return self.output_dropout(contracted)
+
+
+class WavLMEncoderLayer(GradientCheckpointingLayer):
+    """
+    Encoder layer that applies layer norm after each residual sum: self-attention, dropout, residual add and
+    `layer_norm`, then feed-forward, residual add and `final_layer_norm`.
+    """
+
+    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
+        super().__init__()
+        dim, eps = config.hidden_size, config.layer_norm_eps
+        self.attention = WavLMAttention(
+            embed_dim=dim,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(dim, eps=eps)
+        self.feed_forward = WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(dim, eps=eps)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
+        """Returns `(hidden_states, position_bias)`, plus the attention weights when `output_attentions` is set."""
+        attn_out, attn_weights, position_bias = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+            index=index,
+        )
+        # in both sub-blocks the residual is added first and the layer norm is applied to the sum
+        hidden_states = self.layer_norm(hidden_states + self.dropout(attn_out))
+
+        hidden_states = self.final_layer_norm(hidden_states + self.feed_forward(hidden_states))
+
+        if output_attentions:
+            return (hidden_states, position_bias, attn_weights)
+        return (hidden_states, position_bias)
+
+
+class WavLMEncoderLayerStableLayerNorm(GradientCheckpointingLayer):
+    """
+    Encoder layer that applies layer norm before each sub-block: `layer_norm` feeds self-attention and
+    `final_layer_norm` feeds the feed-forward block; each sub-block output is added to its un-normalized input.
+    """
+
+    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
+        super().__init__()
+        dim, eps = config.hidden_size, config.layer_norm_eps
+        self.attention = WavLMAttention(
+            embed_dim=dim,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(dim, eps=eps)
+        self.feed_forward = WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(dim, eps=eps)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
+        """Returns `(hidden_states, position_bias)`, plus the attention weights when `output_attentions` is set."""
+        attn_out, attn_weights, position_bias = self.attention(
+            self.layer_norm(hidden_states),
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+        )
+        hidden_states = hidden_states + self.dropout(attn_out)
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+
+        layer_result = (hidden_states, position_bias)
+        return layer_result + (attn_weights,) if output_attentions else layer_result
+
+
+class WavLMEncoder(nn.Module):  # conv positional embedding + layer norm + dropout, then the WavLMEncoderLayer stack
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList(  # only layer 0 is built with has_relative_position_bias=True
+            [WavLMEncoderLayer(config, has_relative_position_bias=(i == 0)) for i in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # make sure padded tokens output 0 (in-place write into the tensor that was passed in)
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0  # NOTE(review): `~` presumes a boolean mask; confirm at callers
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+        position_bias = None  # None on the first layer call; afterwards the value returned by the previous layer
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])  # drawn on every iteration, in training and in eval mode
+
+            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)  # never layer 0
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_bias=position_bias,
+                    output_attentions=output_attentions,
+                    index=i,
+                )
+
+                hidden_states, position_bias = layer_outputs[:2]  # NOTE(review): also runs for "skipped" layers when synced
+
+            if skip_the_layer:
+                layer_outputs = (None, None, None)  # a skipped layer contributes None to the attentions tuple
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:  # plain tuple with the None entries dropped
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class WavLMEncoderStableLayerNorm(nn.Module):  # positional embedding, the layer stack, then one final layer norm
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList(  # only layer 0 is built with has_relative_position_bias=True
+            [
+                WavLMEncoderLayerStableLayerNorm(config, has_relative_position_bias=(i == 0))
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # zero the hidden states at masked-out positions (in-place write into the tensor that was passed in)
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0  # NOTE(review): `~` presumes a boolean mask; confirm at callers
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+        position_bias = None  # None on the first layer call; afterwards the value returned by the previous layer
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])  # drawn on every iteration, in training and in eval mode
+
+            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)  # never layer 0
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    output_attentions=output_attentions,
+                    position_bias=position_bias,
+                )
+                hidden_states, position_bias = layer_outputs[:2]  # NOTE(review): also runs for "skipped" layers when synced
+
+            if skip_the_layer:
+                layer_outputs = (None, None, None)  # a skipped layer contributes None to the attentions tuple
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)  # applied once, after the whole stack
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:  # plain tuple with the None entries dropped
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
+        )
+
+
+class WavLMGumbelVectorQuantizer(nn.Module):
+    """
+    Gumbel-softmax vector quantizer, as described in [CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144). Each frame selects one codevector per group and
+    the selections are concatenated; `forward` returns the quantized frames together with a codebook perplexity.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        groups = config.num_codevector_groups
+        entries_per_group = config.num_codevectors_per_group
+        self.num_groups = groups
+        self.num_vars = entries_per_group
+
+        if config.codevector_dim % groups != 0:
+            raise ValueError(
+                f"`config.codevector_dim {config.codevector_dim} must be divisible"
+                f" by `config.num_codevector_groups` {self.num_groups} "
+                "for concatenation."
+            )
+
+        # codebook storage (allocated uninitialized here); every group owns `num_vars` consecutive rows
+        codebook = torch.FloatTensor(1, groups * entries_per_group, config.codevector_dim // groups)
+        self.codevectors = nn.Parameter(codebook)
+        self.weight_proj = nn.Linear(config.conv_dim[-1], groups * entries_per_group)
+
+        # gumbel-softmax temperature; can be decayed for training
+        self.temperature = 2
+
+    @staticmethod
+    def _compute_perplexity(probs):
+        """Sum over groups of exp(entropy) of the code distribution averaged over the first axis of `probs`."""
+        avg = probs.mean(dim=0)
+        entropy = -torch.sum(avg * torch.log(avg + 1e-7), dim=-1)
+        return torch.exp(entropy).sum()
+
+    def forward(self, hidden_states):
+        """Quantizes a `(batch, sequence, features)` input; returns `(codevectors, perplexity)`."""
+        batch_size, sequence_length, _ = hidden_states.shape
+        num_frames = batch_size * sequence_length
+
+        # project to codebook logits: one row per (frame, group) pair
+        logits = self.weight_proj(hidden_states)
+        logits = logits.view(num_frames * self.num_groups, -1)
+
+        if self.training:
+            # differentiable sampling: hard one-hot selection through gumbel softmax
+            one_hot = nn.functional.gumbel_softmax(logits.float(), tau=self.temperature, hard=True)
+            one_hot = one_hot.type_as(logits)
+
+            # perplexity is measured on the plain softmax distribution
+            grouped_logits = logits.view(num_frames, self.num_groups, -1).float()
+            soft_dist = torch.softmax(grouped_logits, dim=-1)
+            perplexity = self._compute_perplexity(soft_dist)
+        else:
+            # non-differentiable path: hard one-hot selection from the argmax
+            winners = logits.argmax(dim=-1).view(-1, 1)
+            one_hot = logits.new_zeros(*logits.shape).scatter_(-1, winners, 1.0)
+            grouped = one_hot.view(num_frames, self.num_groups, -1)
+            perplexity = self._compute_perplexity(grouped)
+
+        # weight the codebook by the selection, then sum inside each group to pick its codevector
+        selection = one_hot.view(num_frames, -1).unsqueeze(-1)
+        weighted = selection * self.codevectors
+        picked = weighted.view(num_frames, self.num_groups, self.num_vars, -1)
+        codevectors = picked.sum(-2).view(batch_size, sequence_length, -1)
+
+        return codevectors, perplexity
+
+
+@auto_docstring
+class WavLMPreTrainedModel(PreTrainedModel):  # shared base: weight init plus length / attention-mask helpers
+    config: WavLMConfig
+    base_model_prefix = "wavlm"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_flex_attn = False
+
+    def _init_weights(self, module):
+        """Initialize the weights of `module` in place, using the first branch below whose type check matches"""
+        # gumbel softmax requires special init
+        if isinstance(module, WavLMGumbelVectorQuantizer):
+            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+            module.weight_proj.bias.data.zero_()
+            nn.init.uniform_(module.codevectors)
+        elif isinstance(module, WavLMPositionalConvEmbedding):
+            nn.init.normal_(
+                module.conv.weight,
+                mean=0,
+                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+            )
+            nn.init.constant_(module.conv.bias, 0)
+        elif isinstance(module, WavLMFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)  # uniform bound for both weight and bias
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):  # identity affine transform: weight 1, bias 0
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the output length after all `config.conv_kernel` / `config.conv_stride` layers (plus adapter layers)
+        """
+
+        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter  # None -> use the config value
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        if add_adapter:
+            for _ in range(self.config.num_adapter_layers):  # each adapter layer is counted with kernel size 1
+                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+    ):
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        output_lengths = output_lengths.to(torch.long)
+
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(  # fresh (batch_size, feature_vector_length) mask replacing the input-level one
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations makes sure that all values before the output lengths idxs are attended to
+        # NOTE(review): an output length of 0 would index column -1 (the last one) -- confirm inputs are never that short
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()  # fill every column up to the marker
+        return attention_mask
+
+
+class WavLMNoLayerNormConvLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class WavLMLayerNormConvLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+
+        hidden_states = hidden_states.transpose(-2, -1)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states.transpose(-2, -1)
+
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class WavLMGroupNormConvLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class WavLMFeatureEncoder(nn.Module):
+    """Construct the features from raw audio waveform"""
+
+    def __init__(self, config):
+        super().__init__()
+
+        if config.feat_extract_norm == "group":
+            conv_layers = [WavLMGroupNormConvLayer(config, layer_id=0)] + [
+                WavLMNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
+            ]
+        elif config.feat_extract_norm == "layer":
+            conv_layers = [WavLMLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
+        else:
+            raise ValueError(
+                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+            )
+        self.conv_layers = nn.ModuleList(conv_layers)
+        self.gradient_checkpointing = False
+        self._requires_grad = True
+
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def forward(self, input_values):
+        hidden_states = input_values[:, None]
+
+        # make sure hidden_states require grad for gradient_checkpointing
+        if self._requires_grad and self.training:
+            hidden_states.requires_grad = True
+
+        for conv_layer in self.conv_layers:
+            hidden_states = conv_layer(hidden_states)
+
+        return hidden_states
+
+
+class WavLMAdapterLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            config.output_hidden_size,
+            2 * config.output_hidden_size,
+            config.adapter_kernel_size,
+            stride=config.adapter_stride,
+            padding=1,
+        )
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = nn.functional.glu(hidden_states, dim=1)
+
+        return hidden_states
+
+
+class WavLMAdapter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        # feature dim might need to be down-projected
+        if config.output_hidden_size != config.hidden_size:
+            self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
+            self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
+        else:
+            self.proj = self.proj_layer_norm = None
+
+        self.layers = nn.ModuleList(WavLMAdapterLayer(config) for _ in range(config.num_adapter_layers))
+        self.layerdrop = config.layerdrop
+
+    def forward(self, hidden_states):
+        # down project hidden_states if necessary
+        if self.proj is not None and self.proj_layer_norm is not None:
+            hidden_states = self.proj(hidden_states)
+            hidden_states = self.proj_layer_norm(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)
+
+        for layer in self.layers:
+            layerdrop_prob = np.random.random()
+            if not self.training or (layerdrop_prob > self.layerdrop):
+                hidden_states = layer(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+def _compute_mask_indices(
+    shape: tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    attention_mask: Optional[torch.LongTensor] = None,
+    min_masks: int = 0,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
+    CPU as part of the preprocessing during training.
+
+    Args:
+        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
+               the first element is the batch size and the second element is the length of the axis to span.
+        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+                    independently generated mask spans of length `mask_length` is computed by
+                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+                    actual percentage will be smaller.
+        mask_length: size of the mask
+        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+                        each batch dimension.
+        min_masks: minimum number of masked spans
+
+    Returns:
+        A boolean `np.ndarray` of shape `(batch_size, sequence_length)` in which `True` marks a masked position.
+
+    Raises:
+        ValueError: If `mask_length` is smaller than 1 or larger than `sequence_length`.
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+            f" and `sequence_length`: {sequence_length}`"
+        )
+
+    # epsilon is used for probabilistic rounding
+    # (drawn once per call, so every row of the batch is rounded with the same offset)
+    epsilon = np.random.rand(1).item()
+
+    def compute_num_masked_span(input_length):
+        """Given input length, compute how many spans should be masked"""
+        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+        num_masked_span = max(num_masked_span, min_masks)
+
+        # make sure num masked span <= sequence_length
+        if num_masked_span * mask_length > sequence_length:
+            num_masked_span = sequence_length // mask_length
+
+        # make sure num_masked span is also <= input_length - (mask_length - 1)
+        if input_length - (mask_length - 1) < num_masked_span:
+            num_masked_span = max(input_length - (mask_length - 1), 0)
+
+        return num_masked_span
+
+    # compute number of masked spans in batch
+    # (per-row valid lengths come from the attention mask; without one every row uses the full length)
+    input_lengths = (
+        attention_mask.detach().sum(-1).tolist()
+        if attention_mask is not None
+        else [sequence_length for _ in range(batch_size)]
+    )
+
+    # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+    spec_aug_mask_idxs = []
+
+    # span count for a full-length row; every row's start-index vector is padded up to this size below
+    max_num_masked_span = compute_num_masked_span(sequence_length)
+
+    # nothing to mask: return the all-False mask
+    if max_num_masked_span == 0:
+        return spec_aug_mask
+
+    for input_length in input_lengths:
+        # compute num of masked spans for this input
+        num_masked_span = compute_num_masked_span(input_length)
+
+        # get random indices to mask
+        # (distinct start positions, restricted so that a span of `mask_length` starting there ends
+        # before `input_length`)
+        spec_aug_mask_idx = np.random.choice(
+            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+        )
+
+        # pick first sampled index that will serve as a dummy index to pad vector
+        # to ensure same dimension for all batches due to probabilistic rounding
+        # Picking first sample just pads those vectors twice.
+        if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller then
+            # `sequence_length` in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+            dummy_mask_idx = sequence_length - 1
+        else:
+            dummy_mask_idx = spec_aug_mask_idx[0]
+
+        # pad this row's start indices with the dummy index up to `max_num_masked_span` entries
+        spec_aug_mask_idx = np.concatenate(
+            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+        )
+        spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+    # stack the per-row start indices into one (batch_size, max_num_masked_span) array
+    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+    # expand masked indices to masked spans
+    spec_aug_mask_idxs = np.broadcast_to(
+        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
+
+    # add offset to the starting indexes so that indexes now create a span
+    offsets = np.arange(mask_length)[None, None, :]
+    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+        batch_size, max_num_masked_span * mask_length
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # ensure that we cannot have indices larger than sequence_length
+    if spec_aug_mask_idxs.max() > sequence_length - 1:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+# WavLM reuses the Wav2Vec2 base model output class under its own name.
+WavLMBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+@auto_docstring
+class WavLMModel(WavLMPreTrainedModel):
+    def __init__(self, config: WavLMConfig):
+        super().__init__(config)
+        self.config = config
+        self.feature_extractor = WavLMFeatureEncoder(config)
+        self.feature_projection = WavLMFeatureProjection(config)
+
+        # model only needs masking vector if mask prob is > 0.0
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        if config.do_stable_layer_norm:
+            self.encoder = WavLMEncoderStableLayerNorm(config)
+        else:
+            self.encoder = WavLMEncoder(config)
+
+        self.adapter = WavLMAdapter(config) if config.add_adapter else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.feature_extractor._freeze_parameters()
+
+    def _mask_hidden_states(
+        self,
+        hidden_states: torch.FloatTensor,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://huggingface.co/papers/1904.08779).
+        """
+
+        # `config.apply_spec_augment` can set masking to False
+        if not getattr(self.config, "apply_spec_augment", True):
+            return hidden_states
+
+        # generate indices & apply SpecAugment along time axis
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        if mask_time_indices is not None:
+            # apply SpecAugment along time axis with given mask_time_indices
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+        elif self.config.mask_time_prob > 0 and self.training:
+            mask_time_indices = _compute_mask_indices(
+                (batch_size, sequence_length),
+                mask_prob=self.config.mask_time_prob,
+                mask_length=self.config.mask_time_length,
+                attention_mask=attention_mask,
+                min_masks=self.config.mask_time_min_masks,
+            )
+            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+
+        if self.config.mask_feature_prob > 0 and self.training:
+            # generate indices & apply SpecAugment along feature axis
+            mask_feature_indices = _compute_mask_indices(
+                (batch_size, hidden_size),
+                mask_prob=self.config.mask_feature_prob,
+                mask_length=self.config.mask_feature_length,
+                min_masks=self.config.mask_feature_min_masks,
+            )
+            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
+            hidden_states[mask_feature_indices] = 0
+
+        return hidden_states
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, WavLMBaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states)
+
+        if not return_dict:
+            return (hidden_states, extract_features) + encoder_outputs[1:]
+
+        return WavLMBaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# Offset used by the task heads below when indexing or slicing the base model's output
+# (`outputs[_HIDDEN_STATES_START_POSITION]` and `outputs[_HIDDEN_STATES_START_POSITION:]`).
+_HIDDEN_STATES_START_POSITION = 2
+
+
+@auto_docstring(
+    custom_intro="""
+    WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
+    """
+)
+class WavLMForCTC(WavLMPreTrainedModel):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        r"""
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`WavLMForCTC`] with adapters. Uses 'eng' by
+            default.
+        """
+        super().__init__(config)
+
+        self.wavlm = WavLMModel(config)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        self.target_lang = target_lang
+
+        if config.vocab_size is None:
+            raise ValueError(
+                f"You are trying to instantiate {self.__class__} with a configuration that "
+                "does not define the vocabulary size of the language model head. Please "
+                "instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+                "or define `vocab_size` of your model's configuration."
+            )
+        output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def tie_weights(self):
+        """
+        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
+        passing `target_lang=...` to `from_pretrained(...)`.
+
+        This method is **not** supposed to be called by the user and is prone to be changed in the future.
+        """
+
+        # Note that `tie_weights` is usually used to tie input and output embedding weights. The method is re-purposed to
+        # correctly load adapter layers for WavLM so that we do not have to introduce a new API to
+        # [`PreTrainedModel`]. While slightly hacky, WavLM never has to tie input and output embeddings, so that it is
+        # ok to repurpose this function here.
+        target_lang = self.target_lang
+
+        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
+            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
+        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
+            logger.info("By default `target_lang` is set to 'eng'.")
+        elif target_lang is not None:
+            self.load_adapter(target_lang, force_load=True)
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wavlm.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wavlm.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+        outputs = self.wavlm(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
+    SUPERB Keyword Spotting.
+    """
+)
+class WavLMForSequenceClassification(WavLMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)"
+            )
+        self.wavlm = WavLMModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wavlm.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wavlm.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wavlm(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+        if attention_mask is None:
+            pooled_output = hidden_states.mean(dim=1)
+        else:
+            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+            expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_padding_mask] = 0.0
+            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)"
+            )
+        self.wavlm = WavLMModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.num_labels = config.num_labels
+
+        self.init_weights()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wavlm.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wavlm.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
+        labels (`torch.Tensor` of shape `(batch_size, num_frames, config.num_labels)`, *optional*):
+            Frame-level targets for the cross-entropy loss, given as one row of `config.num_labels` scores per
+            frame (e.g. one-hot). Each row is reduced with `argmax` to a class index in `[0, ...,
+            config.num_labels - 1]` before the loss is computed.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wavlm(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Either mix all returned layers with softmax-normalized learned weights, or use outputs[0].
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+        # No pooling: the classifier is applied to every time step, giving one row of logits per frame.
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
+        # Tuple output: the loss comes first when it was computed, as in the other heads of this file.
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class AMSoftmaxLoss(nn.Module):
+    # Additive-margin softmax loss over cosine similarities to learnable class vectors.
+    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
+        super().__init__()
+        self.scale, self.margin, self.num_labels = scale, margin, num_labels
+        # One column of `weight` per class; the columns are L2-normalized in `forward`.
+        self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
+        self.loss = nn.CrossEntropyLoss()
+
+    def forward(self, hidden_states, labels):
+        """Return the cross-entropy over scaled cosine logits, with `margin` subtracted on the target class."""
+        targets = labels.flatten()
+        unit_classes = nn.functional.normalize(self.weight, dim=0)
+        unit_embeddings = nn.functional.normalize(hidden_states, dim=1)
+        cosine = torch.mm(unit_embeddings, unit_classes)
+
+        # Only the entry of each row that matches its target gets the margin penalty.
+        is_target = nn.functional.one_hot(targets, self.num_labels).bool()
+        penalized = torch.where(is_target, cosine - self.margin, cosine)
+
+        return self.loss(self.scale * penalized, targets)
+
+
+class TDNNLayer(nn.Module):
+    # Time-delay layer: a linear map over `kernel_size` dilated frames, evaluated as a 1D convolution.
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        # Layer 0 reads `tdnn_dim[0]` channels; later layers read the previous layer's width.
+        input_layer_id = layer_id - 1 if layer_id > 0 else layer_id
+        self.in_conv_dim = config.tdnn_dim[input_layer_id]
+        self.out_conv_dim = config.tdnn_dim[layer_id]
+        self.kernel_size = config.tdnn_kernel[layer_id]
+        self.dilation = config.tdnn_dilation[layer_id]
+
+        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
+        self.activation = nn.ReLU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if is_peft_available():
+            from peft.tuners.lora import LoraLayer
+
+            if isinstance(self.kernel, LoraLayer):
+                warnings.warn(
+                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
+                    "You should exclude TDNNLayer from LoRA's target modules.",
+                )
+
+        # nn.Linear is kept for backward compatibility; its weight is viewed as a conv filter for F.conv1d.
+        conv_filter = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
+        frames_last = hidden_states.transpose(1, 2)
+        convolved = nn.functional.conv1d(frames_last, conv_filter, self.kernel.bias, dilation=self.dilation)
+
+        return self.activation(convolved.transpose(1, 2))
+
+
+@auto_docstring(
+    custom_intro="""
+    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
+    """
+)
+class WavLMForXVector(WavLMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.wavlm = WavLMModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
+        # One TDNN layer per entry of config.tdnn_dim; forward() applies them in order.
+        tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
+        self.tdnn = nn.ModuleList(tdnn_layers)
+        # Input width is 2 * tdnn_dim[-1] because forward() concatenates per-channel mean and std.
+        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
+        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
+        # Loss module; forward() only calls it when labels are passed.
+        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
+
+        self.init_weights()
+
+    def freeze_feature_extractor(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        warnings.warn(
+            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
+            "Please use the equivalent `freeze_feature_encoder` method instead.",
+            FutureWarning,
+        )
+        self.freeze_feature_encoder()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wavlm.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wavlm.parameters():
+            param.requires_grad = False
+
+    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the length left after all TDNN layers, using stride 1 and each entry of `config.tdnn_kernel`
+        """
+        # NOTE(review): `config.tdnn_dilation` is not used here although TDNNLayer convolves with it - confirm.
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+
+        for kernel_size in self.config.tdnn_kernel:
+            input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
+
+        return input_lengths
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, XVectorOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Class indices in `[0, ..., config.num_labels - 1]`. When given, the loss is computed by
+            `self.objective` (the `AMSoftmaxLoss` built in `__init__`) on the output of `self.classifier`; no
+            regression loss is computed here.
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wavlm(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # Either mix all returned layers with softmax-normalized learned weights, or use outputs[0].
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+
+        for tdnn_layer in self.tdnn:
+            hidden_states = tdnn_layer(hidden_states)
+
+        # Statistic pooling: per-channel mean and std over dim 1 (only the first `length` frames with a mask)
+        if attention_mask is None:
+            mean_features = hidden_states.mean(dim=1)
+            std_features = hidden_states.std(dim=1)
+        else:
+            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
+            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
+            mean_features = []
+            std_features = []
+            for i, length in enumerate(tdnn_output_lengths):
+                mean_features.append(hidden_states[i, :length].mean(dim=0))
+                std_features.append(hidden_states[i, :length].std(dim=0))
+            mean_features = torch.stack(mean_features)
+            std_features = torch.stack(std_features)
+        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
+
+        output_embeddings = self.feature_extractor(statistic_pooling)
+        logits = self.classifier(output_embeddings)
+
+        loss = None
+        if labels is not None:
+            loss = self.objective(logits, labels)
+
+        if not return_dict:
+            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return XVectorOutput(
+            loss=loss,
+            logits=logits,
+            embeddings=output_embeddings,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "WavLMForAudioFrameClassification",
+    "WavLMForCTC",
+    "WavLMForSequenceClassification",
+    "WavLMForXVector",
+    "WavLMModel",
+    "WavLMPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/modular_wavlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/modular_wavlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..75e360b6a1d35fef257f515358139af927665975
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/modular_wavlm.py
@@ -0,0 +1,588 @@
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, Wav2Vec2BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+from ..wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2FeatureProjection,
+    Wav2Vec2FeedForward,
+    Wav2Vec2ForAudioFrameClassification,
+    Wav2Vec2ForCTC,
+    Wav2Vec2ForSequenceClassification,
+    Wav2Vec2ForXVector,
+    Wav2Vec2Model,
+    Wav2Vec2PositionalConvEmbedding,
+    Wav2Vec2PreTrainedModel,
+)
+from .configuration_wavlm import WavLMConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class WavLMPositionalConvEmbedding(Wav2Vec2PositionalConvEmbedding):
+    pass  # no overrides: inherits everything from Wav2Vec2PositionalConvEmbedding
+
+
+class WavLMFeatureProjection(Wav2Vec2FeatureProjection):
+    pass  # no overrides: inherits everything from Wav2Vec2FeatureProjection
+
+
+class WavLMAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper, with a gated relative position bias"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        num_buckets: int = 320,
+        max_distance: int = 800,
+        has_relative_position_bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+        # Bucketing parameters read by `_relative_positions_bucket`.
+        self.num_buckets = num_buckets
+        self.max_distance = max_distance
+        # Parameters of the per-head gate that rescales the position bias in forward().
+        self.gru_rel_pos_const = nn.Parameter(torch.ones(1, self.num_heads, 1, 1))
+        self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8)
+        # Without this embedding `compute_bias` cannot run, so forward() must then be given `position_bias`.
+        if has_relative_position_bias:
+            self.rel_attn_embed = nn.Embedding(self.num_buckets, self.num_heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_bias: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        index=0,  # not read anywhere in this method
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Attention layer with relative attention; returns `(attn_output, attn_weights, position_bias)`"""
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # first pass of attention layer creates position bias
+        if position_bias is None:
+            position_bias = self.compute_bias(tgt_len, tgt_len)
+            position_bias = (
+                position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, tgt_len)
+            )
+
+        # Compute relative position bias:
+        # 1) get reshape hidden_states
+        gated_hidden_states = hidden_states.view(hidden_states.shape[:-1] + (self.num_heads, -1))
+        gated_hidden_states = gated_hidden_states.permute(0, 2, 1, 3)
+
+        # 2) project hidden states to 8 values per head and position, then sum them in two groups of 4
+        relative_position_proj = self.gru_rel_pos_linear(gated_hidden_states)
+        relative_position_proj = relative_position_proj.view(gated_hidden_states.shape[:-1] + (2, 4)).sum(-1)
+
+        # 3) compute gate for position bias from projected hidden states
+        gate_a, gate_b = torch.sigmoid(relative_position_proj).chunk(2, dim=-1)
+        gate_output = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0
+
+        # 4) apply gate to position bias to compute gated position_bias
+        gated_position_bias = gate_output.view(bsz * self.num_heads, -1, 1) * position_bias
+        gated_position_bias = gated_position_bias.view((-1, tgt_len, tgt_len))
+
+        attn_output, attn_weights = self.torch_multi_head_self_attention(
+            hidden_states, attention_mask, gated_position_bias, output_attentions
+        )
+        # The un-gated position_bias is returned, not the gated one used for this call.
+        return attn_output, attn_weights, position_bias
+
+    def torch_multi_head_self_attention(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: Union[torch.LongTensor, torch.BoolTensor],
+        gated_position_bias: torch.FloatTensor,
+        output_attentions: bool,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
+        """simple wrapper around torch's multi_head_attention_forward function"""
+        # self-attention assumes q = k = v
+        query = key = value = hidden_states.transpose(0, 1)
+        key_padding_mask = attention_mask.ne(1) if attention_mask is not None else None
+
+        # disable bias and add_zero_attn
+        bias_k = bias_v = None
+        add_zero_attn = False
+
+        # PyTorch 1.3.0 has F.multi_head_attention_forward defined
+        # so no problem with backwards compatibility
+        attn_output, attn_weights = F.multi_head_attention_forward(
+            query,
+            key,
+            value,
+            self.embed_dim,
+            self.num_heads,
+            torch.empty([0]),  # placeholder for the packed in_proj_weight; separate q/k/v weights are passed below
+            torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
+            bias_k,
+            bias_v,
+            add_zero_attn,
+            self.dropout,
+            self.out_proj.weight,
+            self.out_proj.bias,
+            self.training,
+            key_padding_mask,
+            output_attentions,
+            gated_position_bias,  # sits in the `attn_mask` position of the torch signature
+            use_separate_proj_weight=True,
+            q_proj_weight=self.q_proj.weight,
+            k_proj_weight=self.k_proj.weight,
+            v_proj_weight=self.v_proj.weight,
+        )
+
+        # [Seq_Len, Batch Size, ...] -> [Batch Size, Seq_Len, ...]
+        attn_output = attn_output.transpose(0, 1)
+
+        if attn_weights is not None:
+            # IMPORTANT: Attention weights are averaged weights
+            # here which should not be the case. This is an open issue
+            # on PyTorch: https://github.com/pytorch/pytorch/issues/32590
+            attn_weights = attn_weights[:, None].broadcast_to(
+                attn_weights.shape[:1] + (self.num_heads,) + attn_weights.shape[1:]
+            )
+
+        return attn_output, attn_weights
+
+    def compute_bias(self, query_length: int, key_length: int) -> torch.FloatTensor:
+        context_position = torch.arange(query_length, dtype=torch.long)[:, None]
+        memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
+        relative_position = memory_position - context_position
+        relative_position_bucket = self._relative_positions_bucket(relative_position)
+        relative_position_bucket = relative_position_bucket.to(self.rel_attn_embed.weight.device)
+        values = self.rel_attn_embed(relative_position_bucket)
+        values = values.permute([2, 0, 1])  # (query, key, heads) -> (heads, query, key)
+        return values
+
+    def _relative_positions_bucket(self, relative_positions: torch.FloatTensor) -> torch.FloatTensor:
+        num_buckets = self.num_buckets // 2
+        # Positive offsets are shifted into the upper half of the bucket range; the rest use the lower half.
+        relative_buckets = (relative_positions > 0).to(torch.long) * num_buckets
+        relative_positions = torch.abs(relative_positions)
+        # Distances below max_exact keep one bucket each; larger ones are binned on a log scale.
+        max_exact = num_buckets // 2
+        is_small = relative_positions < max_exact
+
+        relative_positions_if_large = torch.log(relative_positions.float() / max_exact)
+        relative_positions_if_large = relative_positions_if_large / math.log(self.max_distance / max_exact)
+        relative_positions_if_large = relative_positions_if_large * (num_buckets - max_exact)
+        relative_position_if_large = (max_exact + relative_positions_if_large).to(torch.long)
+        relative_position_if_large = torch.min(
+            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
+        )
+        # The log-scale value is only selected where is_small is False.
+        relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)
+        return relative_buckets
+
+
+class WavLMFeedForward(Wav2Vec2FeedForward):
+    pass  # no overrides: inherits everything from Wav2Vec2FeedForward
+
+
+class WavLMEncoderLayer(GradientCheckpointingLayer):
+    # Post-norm block: each sub-layer output is added to its input and the sum is layer-normalized.
+    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
+        """Return `(hidden_states, position_bias)`, plus the attention weights if `output_attentions` is set."""
+        attn_output, attn_weights, position_bias = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+            index=index,
+        )
+
+        # attention sub-layer: residual add, then normalize
+        hidden_states = self.layer_norm(hidden_states + self.dropout(attn_output))
+
+        # feed-forward sub-layer: residual add, then normalize
+        hidden_states = self.final_layer_norm(hidden_states + self.feed_forward(hidden_states))
+
+        layer_outputs = (hidden_states, position_bias)
+        if not output_attentions:
+            return layer_outputs
+
+        # attention weights go last, after the position bias returned by the attention module
+        return layer_outputs + (attn_weights,)
+
+
+class WavLMEncoderLayerStableLayerNorm(GradientCheckpointingLayer):
+    # Pre-norm block: layer norm is applied on each sub-layer's input, never on the residual stream.
+    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
+        super().__init__()
+        self.attention = WavLMAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+            num_buckets=config.num_buckets,
+            max_distance=config.max_bucket_distance,
+            has_relative_position_bias=has_relative_position_bias,
+        )
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.feed_forward = WavLMFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
+        """Return `(hidden_states, position_bias)`, plus the attention weights if `output_attentions` is set."""
+        attn_output, attn_weights, position_bias = self.attention(
+            self.layer_norm(hidden_states),
+            attention_mask=attention_mask,
+            position_bias=position_bias,
+            output_attentions=output_attentions,
+        )
+        # attention sees the normalized input; the residual uses the un-normalized one
+        hidden_states = hidden_states + self.dropout(attn_output)
+        # feed-forward branch is normalized the same way before being added back
+        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+
+        layer_outputs = (hidden_states, position_bias)
+        if not output_attentions:
+            return layer_outputs
+
+        return layer_outputs + (attn_weights,)
+
+
+class WavLMEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList(  # only layer 0 is built with has_relative_position_bias=True
+            [WavLMEncoderLayer(config, has_relative_position_bias=(i == 0)) for i in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # zero the padded positions in place (the `~` assumes a boolean attention_mask)
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+        position_bias = None  # filled from the first layer's return value, then passed to every later layer
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+            # layer 0 is never skipped (i > 0), and skipping only happens in training mode
+            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_bias=position_bias,
+                    output_attentions=output_attentions,
+                    index=i,
+                )
+
+                hidden_states, position_bias = layer_outputs[:2]
+            # NOTE(review): if the layer ran only because of synced_gpus, hidden_states above was still updated.
+            if skip_the_layer:
+                layer_outputs = (None, None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class WavLMEncoderStableLayerNorm(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList(  # only layer 0 is built with has_relative_position_bias=True
+            [
+                WavLMEncoderLayerStableLayerNorm(config, has_relative_position_bias=(i == 0))
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # zero the padded positions in place (the `~` assumes a boolean attention_mask)
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0
+
+        position_embeddings = self.pos_conv_embed(hidden_states)
+        hidden_states = hidden_states + position_embeddings
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+        position_bias = None  # filled from the first layer's return value, then passed to every later layer
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+            # layer 0 is never skipped (i > 0), and skipping only happens in training mode
+            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    output_attentions=output_attentions,
+                    position_bias=position_bias,
+                )
+                hidden_states, position_bias = layer_outputs[:2]
+            # NOTE(review): if the layer ran only because of synced_gpus, hidden_states above was still updated.
+            if skip_the_layer:
+                layer_outputs = (None, None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[2],)
+        # the layer norm is applied once here, after the loop, before the final state is recorded
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
+        )
+
+
+class WavLMGumbelVectorQuantizer(nn.Module):
+    """
+    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
+
+    Inputs are projected to `num_groups * num_vars` logits. For each group one codevector is picked (sampled with a
+    hard Gumbel-softmax in training mode, argmax otherwise) and the per-group picks are concatenated.
+    """
+
+    def __init__(self, config):
+        """
+        Args:
+            config: object providing `num_codevector_groups`, `num_codevectors_per_group`, `codevector_dim` and
+                `conv_dim` (its last entry is the input feature size of the projection layer).
+
+        Raises:
+            ValueError: if `config.codevector_dim` is not divisible by `config.num_codevector_groups`.
+        """
+        super().__init__()
+        self.num_groups = config.num_codevector_groups
+        self.num_vars = config.num_codevectors_per_group
+
+        if config.codevector_dim % self.num_groups != 0:
+            raise ValueError(
+                f"`config.codevector_dim` {config.codevector_dim} must be divisible"
+                f" by `config.num_codevector_groups` {self.num_groups} "
+                "for concatenation."
+            )
+
+        # storage for codebook variables (codewords); allocated uninitialized here and
+        # filled by `WavLMPreTrainedModel._init_weights` via `nn.init.uniform_`
+        self.codevectors = nn.Parameter(
+            torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
+        )
+        self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
+
+        # Gumbel-softmax temperature (`tau`); can be decayed for training
+        self.temperature = 2
+
+    @staticmethod
+    def _compute_perplexity(probs):
+        """
+        Average `probs` over dim 0, take `exp` of the entropy along the last dim, and sum the result to a scalar.
+
+        A small constant (1e-7) is added inside the logarithm so that zero probabilities do not produce `-inf`.
+        """
+        marginal_probs = probs.mean(dim=0)
+        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
+        return perplexity
+
+    def forward(self, hidden_states):
+        """
+        Quantize `hidden_states`.
+
+        Args:
+            hidden_states: tensor unpacked as `(batch_size, sequence_length, hidden_size)`.
+
+        Returns:
+            A tuple `(codevectors, perplexity)`: `codevectors` is viewed as `(batch_size, sequence_length, -1)` and
+            `perplexity` is the scalar returned by `_compute_perplexity`.
+        """
+        batch_size, sequence_length, hidden_size = hidden_states.shape
+
+        # project to codevector dim
+        hidden_states = self.weight_proj(hidden_states)
+        # one row of `num_vars` logits per (position, group) pair
+        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
+
+        if self.training:
+            # sample code vector probs via gumbel in a differentiable way
+            codevector_probs = nn.functional.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True)
+            codevector_probs = codevector_probs.type_as(hidden_states)
+
+            # compute perplexity from the soft (plain softmax) distribution, not from the hard samples
+            codevector_soft_dist = torch.softmax(
+                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
+            )
+            perplexity = self._compute_perplexity(codevector_soft_dist)
+        else:
+            # take argmax in non-differentiable way
+            # compute hard codevector distribution (one hot)
+            codevector_idx = hidden_states.argmax(dim=-1)
+            codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
+                -1, codevector_idx.view(-1, 1), 1.0
+            )
+            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
+
+            perplexity = self._compute_perplexity(codevector_probs)
+
+        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
+        # use probs to retrieve codevectors: weight every codebook entry by its (one-hot) probability ...
+        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
+        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+        # ... then sum over the entries of each group and concatenate the groups along the last dim
+        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
+
+        return codevectors, perplexity
+
+
+class WavLMPreTrainedModel(PreTrainedModel, Wav2Vec2PreTrainedModel):
+    """Base class for WavLM models: declares the model flags and the per-module weight initialization."""
+
+    config: WavLMConfig
+    base_model_prefix = "wavlm"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+    # none of the alternative attention backends is enabled for WavLM
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    _supports_flex_attn = False
+
+    def _init_weights(self, module):
+        """
+        Initialize the weights of `module` in place according to its type.
+
+        The WavLM-specific module types are tested before the generic `nn.Linear` / norm / `nn.Conv1d` cases, so the
+        order of the branches matters. Modules matching no branch are left untouched.
+        """
+        # gumbel softmax requires special init
+        if isinstance(module, WavLMGumbelVectorQuantizer):
+            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+            module.weight_proj.bias.data.zero_()
+            nn.init.uniform_(module.codevectors)
+        elif isinstance(module, WavLMPositionalConvEmbedding):
+            nn.init.normal_(
+                module.conv.weight,
+                mean=0,
+                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+            )
+            nn.init.constant_(module.conv.bias, 0)
+        elif isinstance(module, WavLMFeatureProjection):
+            # uniform in [-k, k] with k = sqrt(1 / in_features), for both weight and bias
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            # norm layers built without affine parameters expose `weight`/`bias` as `None`;
+            # guard like the Linear/Conv1d branches do instead of raising AttributeError
+            if module.bias is not None:
+                module.bias.data.zero_()
+            if module.weight is not None:
+                module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _get_adapters(self):
+        """Adapter hook disabled for WavLM: always raises `AttributeError`."""
+        raise AttributeError("Not needed for WavLM")
+
+    def init_adapter_layers(self):
+        """Adapter hook disabled for WavLM: always raises `AttributeError`."""
+        raise AttributeError("Not needed for WavLM")
+
+    def load_adapter(self):
+        """Adapter hook disabled for WavLM: always raises `AttributeError`."""
+        raise AttributeError("Not needed for WavLM")
+
+
+# Alias: the Wav2Vec2 base model output class is reused under a WavLM name.
+WavLMBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+# The classes below subclass their Wav2Vec2 counterparts without overriding or adding anything.
+class WavLMModel(Wav2Vec2Model):
+    pass
+
+
+class WavLMForCTC(Wav2Vec2ForCTC):
+    pass
+
+
+class WavLMForSequenceClassification(Wav2Vec2ForSequenceClassification):
+    pass
+
+
+class WavLMForAudioFrameClassification(Wav2Vec2ForAudioFrameClassification):
+    pass
+
+
+class WavLMForXVector(Wav2Vec2ForXVector):
+    pass
+
+
+# Public names of this module; `WavLMBaseModelOutput` and `WavLMGumbelVectorQuantizer` are not exported.
+__all__ = [
+    "WavLMForAudioFrameClassification",
+    "WavLMForCTC",
+    "WavLMForSequenceClassification",
+    "WavLMForXVector",
+    "WavLMModel",
+    "WavLMPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..009961255ec729cca3fbbd24f5a92725ceac7942
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/configuration_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/configuration_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ab1b56a97ab4f6ee7bc1f8a02c92f5dd1872c20
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/configuration_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_flax_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_flax_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b44c7704ed59e8380d21e065c37625f94fbffb82
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_flax_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_tf_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_tf_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05e144b829b07ce3f19e944eb73a2bfd0876dedd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_tf_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/processing_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/processing_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23af7e7837a4ca6c467b1b1bb1d95d0e20f6fed9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/processing_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3541d31bce116a339c02dc119f104b7b63704395
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8879c2a17681b93a6a1afe01728004d41dfabee
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/configuration_xcodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/configuration_xcodec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..981106c066833f4ddba469cb3dd198fbf354167a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/configuration_xcodec.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/modeling_xcodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/modeling_xcodec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4870aa2d5fdfa48ee9d10edeefe9c07b455cf365
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xcodec/__pycache__/modeling_xcodec.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8967c3603f2666c7e0a580cf4e3d627d2bdb7cf0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/configuration_xlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/configuration_xlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be811f79797498052800cfe45e32f4e9b4fe4d82
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/configuration_xlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/modeling_tf_xlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/modeling_tf_xlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a486efd5294965abf7ef61cafacc41392e665fb6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/modeling_tf_xlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/modeling_xlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/modeling_xlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..faa4b0c63de6a291be87eefdc342c189cc3f685b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/modeling_xlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/tokenization_xlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/tokenization_xlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21ed572f707e1ba5e09d3d01ef531fb42d46b3dd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm/__pycache__/tokenization_xlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10d876ba8681c6be8786a7c710aaed2d661d6993
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/configuration_xlm_roberta_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/configuration_xlm_roberta_xl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..236af54c3be2e041714ea799d294ce3c632058f6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/configuration_xlm_roberta_xl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/modeling_xlm_roberta_xl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/modeling_xlm_roberta_xl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..875d8da7285578ff2244229aa5f01c7cd0e44728
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlm_roberta_xl/__pycache__/modeling_xlm_roberta_xl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..00e206973a908d82e9b8fe38a3d016fcced9ecd3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 NXAI GmbH. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+)
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    # Only evaluated by static type checkers. Both submodules live inside this package,
+    # so they have to be imported relatively; an absolute `configuration_xlstm` does not resolve.
+    from .configuration_xlstm import *
+    from .modeling_xlstm import *
+else:
+    import sys
+
+    # At runtime, swap this module's entry in `sys.modules` for a `_LazyModule`
+    # built from the import structure that `define_import_structure` derives for this file.
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/configuration_xlstm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/configuration_xlstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..22a9a13fe3e554cacb70d846e6798c92e4eaf703
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/configuration_xlstm.py
@@ -0,0 +1,302 @@
+# Copyright 2025 NXAI GmbH. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""xLSTM configuration."""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import is_xlstm_available, logging
+
+
+# Prefer the type aliases, helper and config class shipped with the external `xlstm` package;
+# otherwise define local stand-ins so this module can still be imported without it.
+if is_xlstm_available():
+    from xlstm.xlstm_large.model import (
+        BackendModeType,
+        ChunkwiseKernelType,
+        DtypeType,
+        SequenceKernelType,
+        StepKernelType,
+        WeightModeType,
+        round_up_to_next_multiple_of,
+        xLSTMLargeConfig,
+    )
+
+    # flag read by `xLSTMConfig.to_xlstm_block_config` to decide what it returns
+    external_xlstm = True
+else:
+    from typing import Literal
+
+    # Fallback aliases listing the option strings accepted without the external package.
+    BackendModeType = Literal["train", "train_with_padding", "inference"]
+    ChunkwiseKernelType = Literal[
+        "chunkwise--native_autograd",
+        "parallel--native_autograd",
+    ]
+    DtypeType = Literal["float32", "bfloat16", "float16"]
+    SequenceKernelType = Literal["native_sequence__native"]
+    StepKernelType = Literal["native"]
+    WeightModeType = Literal["single", "fused"]
+
+    def round_up_to_next_multiple_of(x: int, multiple_of: int) -> int:
+        """Rounds up x to the next multiple of multiple_of."""
+        # computed with floor division on `x + multiple_of - 1`, then cast to int;
+        # a value that is already a multiple of `multiple_of` is returned unchanged
+        return int(((x + multiple_of - 1) // multiple_of) * multiple_of)
+
+    # no `xLSTMLargeConfig` is defined on this path
+    external_xlstm = False
+
+
+logger = logging.get_logger(__name__)
+
+
+class xLSTMConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`xLSTM`]. It is used to instantiate a xLSTM
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the xLSTM-7b [NX-AI/xLSTM-7b](https://huggingface.co/NX-AI/xLSTM-7b) model.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50304):
+            Vocabulary size of the xLSTM model. Defines the number of different tokens that can be represented by the
+            `input_ids` passed when calling [`xLSTMModel`]. Defaults to the GPT2-NeoX tokenizer size.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the embeddings or hidden states. If `None`, the value of `embedding_dim` is used.
+        embedding_dim (`int`, *optional*):
+            Dimensionality of the embeddings or hidden states. If `None`, the value of `hidden_size` is used.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of blocks of the xLSTM model. If `None`, the value of `num_blocks` is used.
+        num_blocks (`int`, *optional*):
+            Number of blocks of the xLSTM model. If `None`, the value of `num_hidden_layers` is used.
+        num_heads (`int`, *optional*, defaults to 8):
+            Number of heads for the xLSTM Layer/Cell.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use biases in the xLSTM model.
+        norm_reduction_force_float32 (`bool`, *optional*, defaults to `True`):
+            Whether to force the norm reduction op to be done in fp32 precision.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie word embeddings to the lm head weights.
+        add_out_norm (`bool`, *optional*, defaults to `True`):
+            Whether to add an output norm after the blocks before the LMHead.
+        norm_eps (`float`, *optional*, defaults to 1e-06):
+            Norm eps for RMSNorm and Layer Norm.
+        qk_dim_factor (`float`, *optional*, defaults to 0.5):
+            Scale factor for the query and key dimension.
+        v_dim_factor (`float`, *optional*, defaults to 1.0):
+            Scale factor for the value dimension.
+        chunkwise_kernel (`ChunkwiseKernelType`, *optional*, defaults to `"chunkwise--native_autograd"`):
+            Kernel type for chunkwise processing mode.
+        sequence_kernel (`SequenceKernelType`, *optional*, defaults to `"native_sequence__native"`):
+            Kernel type for sequence processing mode.
+        step_kernel (`StepKernelType`, *optional*, defaults to `"native"`):
+            Kernel type for step processing mode.
+        mode (`BackendModeType`, *optional*, defaults to `"inference"`):
+            Operation mode (inference is needed for generation).
+        chunk_size (`int`, *optional*, defaults to 64):
+            Internal chunk size.
+        return_last_states (`bool`, *optional*, defaults to `True`):
+            If to return the last states / cache internally. Needed as True for generation.
+        autocast_kernel_dtype (`DtypeType`, *optional*, defaults to `"bfloat16"`):
+            Kernel dtype for the states.
+        eps (`float`, *optional*, defaults to 1e-06):
+            Epsilon for the mLSTM cell post norm.
+        inference_state_dtype (`DtypeType`, *optional*, defaults to `"float32"`):
+            Kernel dtype for states in inference.
+        ffn_proj_factor (`float`, *optional*, defaults to 2.667):
+            Size factor of the post-up projection gated Feed Forward network.
+        ffn_round_up_to_multiple_of (`int`, *optional*, defaults to 64):
+            Size factor round value of the post-up projection gated Feed Forward network.
+        gate_soft_cap (`float`, *optional*, defaults to 15.0):
+            Gate soft cap scale.
+        output_logit_soft_cap (`float`, *optional*, defaults to 30.0):
+            Output logit soft cap scale.
+        weight_mode (`WeightModeType`, *optional*, defaults to `"single"`):
+            Whether parallel linear layers are separated or fused; accepts `"single"` or `"fused"`.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether to use the cache (xLSTMCache).
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Pad token id needed for generation.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            BOS token id needed for generation.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            EOS token id needed for generation.
+        max_inference_chunksize (`int`, *optional*, defaults to 16384):
+            Limit the chunk size for inference to save memory.
+
+    Example:
+
+    ```python
+    >>> from transformers import xLSTMConfig, xLSTMModel
+
+    >>> # Initializing a xLSTM configuration
+    >>> configuration = xLSTMConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = xLSTMModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "xlstm"
+
+    def __init__(
+        self,
+        vocab_size: int = 50304,
+        hidden_size: int = 4096,
+        embedding_dim: Optional[int] = None,
+        num_hidden_layers: Optional[int] = 32,
+        num_blocks: Optional[int] = None,
+        num_heads: int = 8,
+        use_bias: bool = False,
+        norm_reduction_force_float32: bool = True,
+        tie_word_embeddings: bool = False,
+        add_out_norm: bool = True,
+        norm_eps: float = 1e-6,
+        # mlstm_layer
+        qk_dim_factor: float = 0.5,
+        v_dim_factor: float = 1.0,
+        # mlstm backend
+        chunkwise_kernel: ChunkwiseKernelType = "chunkwise--native_autograd",
+        sequence_kernel: SequenceKernelType = "native_sequence__native",
+        step_kernel: StepKernelType = "native",
+        # needed to enable generation
+        mode: BackendModeType = "inference",
+        chunk_size: int = 64,
+        # needed to be true for generation
+        return_last_states: bool = True,
+        autocast_kernel_dtype: DtypeType = "bfloat16",
+        eps: float = 1e-6,
+        inference_state_dtype: DtypeType = "float32",
+        # feedforward
+        ffn_proj_factor: float = 2.667,
+        ffn_round_up_to_multiple_of: int = 64,
+        # capping
+        gate_soft_cap: float = 15.0,
+        output_logit_soft_cap: float = 30.0,
+        # weights
+        weight_mode: WeightModeType = "single",
+        # HF interface
+        use_cache: bool = True,
+        pad_token_id: int = 1,
+        bos_token_id: int = 0,
+        eos_token_id: int = 2,
+        max_inference_chunksize: int = 16384,
+        **kwargs,
+    ):
+        """Store all arguments as attributes and forward the token ids, `tie_word_embeddings` and `kwargs` to the base class."""
+        self.vocab_size = vocab_size
+        # `hidden_size`/`embedding_dim` and `num_hidden_layers`/`num_blocks` are alias pairs: each member falls
+        # back to the other only when it is `None`. Because `hidden_size` and `num_hidden_layers` have non-None
+        # defaults, they take the alias value only if the caller passes `None` for them explicitly.
+        self.hidden_size = hidden_size if hidden_size is not None else embedding_dim
+        self.embedding_dim = embedding_dim if embedding_dim is not None else hidden_size
+        self.num_hidden_layers = num_hidden_layers if num_hidden_layers is not None else num_blocks
+        self.num_blocks = num_blocks if num_blocks is not None else num_hidden_layers
+        self.num_heads = num_heads
+        self.use_bias = use_bias
+        self.tie_word_embeddings = tie_word_embeddings
+        self.add_out_norm = add_out_norm
+        self.norm_eps = norm_eps
+        self.norm_reduction_force_float32 = norm_reduction_force_float32
+        # mlstm_layer
+        self.qk_dim_factor = qk_dim_factor
+        self.v_dim_factor = v_dim_factor
+        # mlstm backend
+        self.chunkwise_kernel = chunkwise_kernel
+        self.sequence_kernel = sequence_kernel
+        self.step_kernel = step_kernel
+        self.mode = mode
+        self.chunk_size = chunk_size
+        self.return_last_states = return_last_states
+        self.autocast_kernel_dtype = autocast_kernel_dtype
+        self.eps = eps
+        self.inference_state_dtype = inference_state_dtype
+        # feedforward
+        self.ffn_proj_factor = ffn_proj_factor
+        self.ffn_round_up_to_multiple_of = ffn_round_up_to_multiple_of
+        # capping
+        self.gate_soft_cap = gate_soft_cap
+        self.output_logit_soft_cap = output_logit_soft_cap
+        self.weight_mode = weight_mode
+
+        self.use_cache = use_cache
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.max_inference_chunksize = max_inference_chunksize
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def qk_dim(self):
+        """`hidden_size * qk_dim_factor`, passed through `round_up_to_next_multiple_of` with `multiple_of=64`."""
+        return round_up_to_next_multiple_of(
+            self.hidden_size * self.qk_dim_factor,
+            multiple_of=64,
+        )
+
+    @property
+    def v_dim(self):
+        """`hidden_size * v_dim_factor`, passed through `round_up_to_next_multiple_of` with `multiple_of=64`."""
+        return round_up_to_next_multiple_of(
+            self.hidden_size * self.v_dim_factor,
+            multiple_of=64,
+        )
+
+    @property
+    def qk_head_dim(self):
+        """`qk_dim` floor-divided by `num_heads`."""
+        return self.qk_dim // self.num_heads
+
+    @property
+    def v_head_dim(self):
+        """`v_dim` floor-divided by `num_heads`."""
+        return self.v_dim // self.num_heads
+
+    def to_xlstm_block_config(self):
+        """
+        Return the configuration object to hand to the xLSTM blocks.
+
+        When the external `xlstm` package was importable (`external_xlstm` is `True`), a new `xLSTMLargeConfig` is
+        built from this config's fields; `hidden_size` and `num_hidden_layers` are passed as `embedding_dim` and
+        `num_blocks`. The HF-interface fields (`use_cache`, token ids, `tie_word_embeddings`,
+        `max_inference_chunksize`) are not forwarded. Otherwise `self` is returned unchanged.
+        """
+        if external_xlstm:
+            return xLSTMLargeConfig(
+                vocab_size=self.vocab_size,
+                embedding_dim=self.hidden_size,
+                num_blocks=self.num_hidden_layers,
+                num_heads=self.num_heads,
+                use_bias=self.use_bias,
+                add_out_norm=self.add_out_norm,
+                norm_eps=self.norm_eps,
+                norm_reduction_force_float32=self.norm_reduction_force_float32,
+                # mlstm_layer
+                qk_dim_factor=self.qk_dim_factor,
+                v_dim_factor=self.v_dim_factor,
+                # mlstm backend
+                chunkwise_kernel=self.chunkwise_kernel,
+                sequence_kernel=self.sequence_kernel,
+                step_kernel=self.step_kernel,
+                mode=self.mode,
+                chunk_size=self.chunk_size,
+                return_last_states=self.return_last_states,
+                autocast_kernel_dtype=self.autocast_kernel_dtype,
+                eps=self.eps,
+                inference_state_dtype=self.inference_state_dtype,
+                # feedforward
+                ffn_proj_factor=self.ffn_proj_factor,
+                ffn_round_up_to_multiple_of=self.ffn_round_up_to_multiple_of,
+                # capping
+                gate_soft_cap=self.gate_soft_cap,
+                output_logit_soft_cap=self.output_logit_soft_cap,
+                weight_mode=self.weight_mode,
+            )
+        else:
+            return self
+
+
+# Only the configuration class is exported from this module.
+__all__ = ["xLSTMConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/modeling_xlstm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/modeling_xlstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd577c0c0bac80bd6718129abc83914963508594
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/modeling_xlstm.py
@@ -0,0 +1,1629 @@
+# Copyright 2025 NXAI GmbH. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch xLSTM Model."""
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...generation import GenerationMixin
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_xlstm_available
+from .configuration_xlstm import xLSTMConfig
+
+
+if is_xlstm_available():
+    from xlstm.xlstm_large.model import RMSNorm as xLSTMRMSNorm
+    from xlstm.xlstm_large.model import mLSTMBlock as xLSTMBlock
+    from xlstm.xlstm_large.model import mLSTMStateType, soft_cap
+
+    external_xlstm = True
+else:
+    from functools import partial
+    from typing import Callable, Literal
+
+    from .configuration_xlstm import round_up_to_next_multiple_of
+
+    mLSTMLayerStateType = tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+    mLSTMStateType = dict[int, mLSTMLayerStateType]
+
+    external_xlstm = False
+
+    def soft_cap(values: torch.Tensor, cap_value: Optional[Union[float, torch.Tensor]] = None) -> torch.Tensor:
+        """
+        Soft caps a tensor to a value.
+
+        Performs a tanh operation on the logits and scales the result to the cap value. Common technique in attention
+        and output language heads to prevent large logits from dominating the softmax. See for example Gemma2:
+        https://huggingface.co/papers/2408.00118
+
+        Args:
+            values: The tensor to cap.
+            cap_value: The value to cap the values to. If None, no cap is applied.
+
+        Returns:
+            The capped values.
+        """
+        if cap_value is None:
+            return values
+        return cap_value * torch.tanh(values / cap_value)
+
+    def mlstm_chunkwise_recurrent_fw_C(
+        matK: torch.Tensor,
+        matV: torch.Tensor,
+        vecB: torch.Tensor,
+        vecI: torch.Tensor,
+        matC_states: Optional[torch.Tensor] = None,
+        vecN_states: Optional[torch.Tensor] = None,
+        scaMinter_states: Optional[torch.Tensor] = None,
+        matC_initial: Optional[torch.Tensor] = None,
+        vecN_initial: Optional[torch.Tensor] = None,
+        scaMinter_initial: Optional[torch.Tensor] = None,
+        qk_scale: Optional[float] = None,
+        chunk_size: int = 64,
+        num_chunks: int = 1,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        batch_size, nh, _, dhqk, dhhv = *matK.shape, matV.shape[-1]
+        nc = num_chunks
+        _dtype, _device = matK.dtype, matK.device
+
+        if qk_scale is None:
+            qk_scale = dhqk**-0.5
+
+        # initialize the states tensors
+        if matC_states is None:
+            matC_states = torch.zeros((batch_size, nh, (nc + 1) * dhqk, dhhv), dtype=_dtype, device=_device)
+        if vecN_states is None:
+            vecN_states = torch.zeros((batch_size, nh, (nc + 1) * dhqk), dtype=_dtype, device=_device)
+        if scaMinter_states is None:
+            scaMinter_states = torch.zeros((batch_size, nh, (nc + 1)), dtype=_dtype, device=_device)
+
+        # assign the initial states to the running states
+        matC_k = (
+            torch.zeros((batch_size, nh, dhqk, dhhv), dtype=_dtype, device=_device)
+            if matC_initial is None
+            else matC_initial
+        )
+        vecN_k = (
+            torch.zeros((batch_size, nh, dhqk), dtype=_dtype, device=_device) if vecN_initial is None else vecN_initial
+        )
+        scaM_inter_k = (
+            torch.zeros((batch_size, nh, 1), dtype=_dtype, device=_device)
+            if scaMinter_initial is None
+            else scaMinter_initial
+        )
+        vecA = vecB[..., -1, None] - vecB + vecI
+        scaG = vecB[..., -1]
+        scaA_max = vecA.max(-1).values
+
+        scaM_inter_k = scaM_inter_k.squeeze(-1)
+
+        for key in range(0, num_chunks):
+            # store the states from the previous iteration before updating them
+            # in the first iteration, these are the initial states
+            matC_states[:, :, key * dhqk : (key + 1) * dhqk, :] = matC_k
+            vecN_states[:, :, key * dhqk : (key + 1) * dhqk] = vecN_k
+            scaMinter_states[:, :, key] = scaM_inter_k
+
+            # m_k update
+            scaA_max_k = scaA_max[:, :, key]
+            scaG_k = scaG[:, :, key]
+            scaM_inter_k_next = torch.max(scaG_k + scaM_inter_k, scaA_max_k)
+            # C_k update
+            matK_chunk = matK[:, :, key * chunk_size : (key + 1) * chunk_size, :]  # * qk_scale
+            matV_chunk = matV[:, :, key * chunk_size : (key + 1) * chunk_size, :]
+            vecA_k = vecA[:, :, key, :]
+
+            vecAbar_k = torch.exp(vecA_k - scaM_inter_k_next[..., None])[:, :, :, None]
+
+            matK_chunk_gated = matK_chunk * vecAbar_k
+
+            scaGbar_k = torch.exp(scaG_k + scaM_inter_k - scaM_inter_k_next)[:, :, None]
+
+            # NOTE: no update in-place (i.e. +=) as this gives error for autograd backward
+            matC_k_next = scaGbar_k[..., None] * matC_k + matK_chunk_gated.transpose(-2, -1) @ (matV_chunk)
+
+            # n_k update
+            vecN_k_next = scaGbar_k * vecN_k + matK_chunk_gated.transpose(-2, -1).sum(-1)
+
+            # move to the next iteration
+            scaM_inter_k = scaM_inter_k_next
+            matC_k = matC_k_next
+            vecN_k = vecN_k_next
+
+        # store the states from the last iteration
+        matC_states[:, :, -dhqk:, :] = matC_k
+        vecN_states[:, :, -dhqk:] = vecN_k
+        scaMinter_states[:, :, -1] = scaM_inter_k
+
+        return matC_states, vecN_states, scaMinter_states
+
+    def mlstm_chunkwise_parallel_fw_H(
+        matQ: torch.Tensor,
+        matK: torch.Tensor,
+        matV: torch.Tensor,
+        # these states must be all states up to the last chunk, i.e. :-1
+        matC_states: torch.Tensor,
+        vecN_states: torch.Tensor,
+        scaMinter_states: torch.Tensor,
+        vecI: torch.Tensor,
+        vecB: torch.Tensor,
+        qk_scale: float,
+        chunk_size: int = 64,
+        num_chunks: int = 1,
+        eps: float = 1e-6,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        _device = matQ.device
+        nc = num_chunks
+        batch_size, nh, dqk, dhv = matC_states.shape
+        matC_k_states = matC_states.view(batch_size, nh, nc, dqk // nc, dhv)
+        vecN_k_states = vecN_states.view(batch_size, nh, nc, dqk // nc)
+        scaMinter_k_states = scaMinter_states
+
+        matQ = matQ.view(batch_size, nh, nc, chunk_size, dqk)
+        matK = matK.view(batch_size, nh, nc, chunk_size, dqk)
+        matV = matV.view(batch_size, nh, nc, chunk_size, dhv)
+
+        ltr = torch.tril(
+            torch.ones(
+                (chunk_size, chunk_size),
+                dtype=torch.bool,
+                device=_device,
+            )
+        )
+
+        # Compute intra chunk contribution: H_intra
+        matF_logsig_chunk = vecB[:, :, :, :, None] - vecB[:, :, :, None, :]
+
+        matF_logsig_mask_chunk = torch.where(ltr, matF_logsig_chunk, -float("inf"))
+
+        matLogD_chunk = matF_logsig_mask_chunk + vecI[:, :, :, None, :]
+
+        # max_state intra
+        vecMintra_k = torch.max(matLogD_chunk, dim=-1, keepdim=False).values
+
+        # max_state combined
+        vecM_b_inter = vecB + scaMinter_k_states[:, :, :, None]
+        vecM_k_combine = torch.maximum(vecM_b_inter, vecMintra_k)
+
+        vecM_k_combine = vecM_k_combine[:, :, :, :, None]
+        vecM_b_inter = vecM_b_inter[:, :, :, :, None]
+
+        matLogD_stabilized_chunk = matLogD_chunk - vecM_k_combine
+        matD_chunk = torch.exp(matLogD_stabilized_chunk)
+
+        matS_chunk = (matQ @ matK.transpose(-2, -1)) * qk_scale
+
+        matM_chunk = matS_chunk * matD_chunk
+
+        # ? Combine H_intra with H_inter
+        vecBbar = torch.exp(vecM_b_inter - vecM_k_combine)
+        matQ_chunk_gated = matQ * vecBbar * qk_scale
+
+        matNumerator_common = matQ_chunk_gated @ matC_k_states + matM_chunk @ matV
+
+        vecDenom_l_common = matQ_chunk_gated @ vecN_k_states.unsqueeze(-1) + matM_chunk.sum(dim=-1, keepdim=True)
+
+        vecDenom_max_common = torch.maximum(torch.abs(vecDenom_l_common), torch.exp(-vecM_k_combine))
+
+        matH_k_chunk = matNumerator_common / (vecDenom_max_common + eps)
+
+        matH_out = matH_k_chunk.view(batch_size, nh, nc * chunk_size, dhv)
+
+        # we need the denominator and the overall max state for the backward pass
+        vecN_out = vecDenom_max_common.reshape(batch_size, nh, nc * chunk_size)
+        vecM_out = vecM_k_combine(batch_size, nh, nc * chunk_size)
+        return matH_out, vecN_out, vecM_out
+
+    def mlstm_chunkwise_fw(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        igate: torch.Tensor,
+        fgate: torch.Tensor,
+        cstate: Optional[torch.Tensor] = None,
+        nstate: Optional[torch.Tensor] = None,
+        mstate: Optional[torch.Tensor] = None,
+        qk_scale: Optional[float] = None,
+        return_last_states: bool = False,
+        return_all_states: bool = False,
+        chunk_size: int = 64,
+        eps: float = 1e-6,
+    ) -> tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Optional[tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
+        Optional[tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
+    ]:
+        batch_size, nh, sequence_length, dhqk = query.shape
+        if sequence_length % chunk_size != 0:
+            raise ValueError(f"Sequence length {sequence_length} is not divisible by chunk size {chunk_size}.")
+        nc = sequence_length // chunk_size
+
+        vecI = igate.view(batch_size, nh, nc, chunk_size)
+        vecF = fgate.view(batch_size, nh, nc, chunk_size)
+
+        # compute the gates, the g and the a and b vectors
+        vecF_logsig = fgate.logsigmoid(vecF)
+        vecB = vecF_logsig.cumsum(-1)
+
+        if qk_scale is None:
+            qk_scale = dhqk**-0.5
+
+        #! materialize the  C_k, n_k, m_k states for each chunk
+        matC_k_states, vecN_k_states, scaMinter_k_states = mlstm_chunkwise_recurrent_fw_C(
+            matK=key,
+            matV=value,
+            vecB=vecB,
+            vecI=vecI,
+            matC_initial=cstate,
+            vecN_initial=nstate,
+            scaMinter_initial=mstate,
+            qk_scale=qk_scale,
+            chunk_size=chunk_size,
+            num_chunks=nc,
+        )
+
+        #! compute the outputs within each chunk
+        matH_out, vecN_out, vecM_out = mlstm_chunkwise_parallel_fw_H(
+            matQ=query,
+            matK=key,
+            matV=value,
+            matC_states=matC_k_states[:, :, :-dhqk, :],
+            vecN_states=vecN_k_states[:, :, :-dhqk],
+            scaMinter_states=scaMinter_k_states[:, :, :-1],
+            vecI=vecI,
+            vecB=vecB,
+            qk_scale=qk_scale,
+            chunk_size=chunk_size,
+            num_chunks=nc,
+            eps=eps,
+        )
+
+        ret_tuple = (matH_out, vecN_out, vecM_out)
+        if return_last_states:
+            ret_tuple += (
+                (matC_k_states[:, :, -dhqk:, :], vecN_k_states[:, :, -dhqk:], scaMinter_k_states[:, :, -1:]),
+            )
+        else:
+            ret_tuple += (None,)
+
+        if return_all_states:
+            ret_tuple += ((matC_k_states, vecN_k_states, scaMinter_k_states),)
+        else:
+            ret_tuple += (None,)
+
+        return ret_tuple
+
+    def mlstm_chunkwise_native_autograd(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        igate: torch.Tensor,
+        fgate: torch.Tensor,
+        c_initial: Optional[torch.Tensor] = None,
+        n_initial: Optional[torch.Tensor] = None,
+        m_initial: Optional[torch.Tensor] = None,
+        return_last_states: bool = False,
+        eps: float = 1e-6,
+        chunk_size: int = 64,
+        **kwargs,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+        batch_size, nh, sequence_length, dhqk = query.shape
+        if sequence_length % chunk_size != 0:
+            raise ValueError(f"Sequence length {sequence_length} is not divisible by chunk size {chunk_size}.")
+        nc = sequence_length // chunk_size
+
+        vecI = igate.view(batch_size, nh, nc, chunk_size)
+        vecF = fgate.view(batch_size, nh, nc, chunk_size)
+
+        # compute the gates, the g and the a and b vectors
+        vecF_logsig = F.logsigmoid(vecF)
+        vecB = vecF_logsig.cumsum(-1)
+
+        qk_scale = dhqk**-0.5
+
+        #! materialize the  C_k, n_k, m_k states for each chunk
+        matC_k_states, vecN_k_states, scaMinter_k_states = mlstm_chunkwise_recurrent_fw_C(
+            matK=key,
+            matV=value,
+            vecB=vecB,
+            vecI=vecI,
+            matC_initial=c_initial,
+            vecN_initial=n_initial,
+            scaMinter_initial=m_initial,
+            qk_scale=qk_scale,
+            chunk_size=chunk_size,
+            num_chunks=nc,
+        )
+
+        #! compute the outputs within each chunk
+        matH_out, vecN_out, vecM_out = mlstm_chunkwise_parallel_fw_H(
+            matQ=query,
+            matK=key,
+            matV=value,
+            matC_states=matC_k_states[:, :, :-dhqk, :],
+            vecN_states=vecN_k_states[:, :, :-dhqk],
+            scaMinter_states=scaMinter_k_states[:, :, :-1],
+            vecI=vecI,
+            vecB=vecB,
+            qk_scale=qk_scale,
+            chunk_size=chunk_size,
+            num_chunks=nc,
+            eps=eps,
+        )
+
+        last_states = (matC_k_states[:, :, -dhqk:, :], vecN_k_states[:, :, -dhqk:], scaMinter_k_states[:, :, -1:])
+
+        if return_last_states:
+            return matH_out, last_states
+        else:
+            return matH_out
+
+    def mlstm_recurrent_step_native(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        igate: torch.Tensor,
+        fgate: torch.Tensor,
+        cstate: torch.Tensor,
+        nstate: torch.Tensor,
+        mstate: torch.Tensor,
+        eps: float = 1e-6,
+        dtype_state: torch.dtype = torch.float32,
+        **kwargs,
+    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
+        """This is a single step of the mLSTM operation in recurrent form."""
+        dtype_qkv = query.dtype
+        matC_old = cstate.to(dtype=dtype_state)
+        vecN_old = nstate.to(dtype=dtype_state)
+        scaM_old = mstate.to(dtype=dtype_state)
+
+        batch_size, nh, dhqk = query.shape
+        _, _, dhhv = value.shape
+        if query.shape != key.shape:
+            raise ValueError("query and key must have the same shape")
+        if matC_old.shape != (batch_size, nh, dhqk, dhhv):
+            raise ValueError(f"matC_old has wrong shape, got {matC_old.shape}")
+        if vecN_old.shape != (batch_size, nh, dhqk):
+            raise ValueError(f"vecN_old has wrong shape, got {vecN_old.shape}")
+        if scaM_old.shape != (batch_size, nh, 1):
+            raise ValueError(f"scaM_old has wrong shape, got {scaM_old.shape}")
+        if igate.shape != (batch_size, nh, 1):
+            raise ValueError(f"scaI has wrong shape, got {igate.shape}")
+        if fgate.shape != (batch_size, nh, 1):
+            raise ValueError(f"scaF has wrong shape, got {fgate.shape}")
+
+        # gates
+        scaF_log = torch.nn.functional.logsigmoid(fgate)
+
+        # update rule
+        scaM_state_new = torch.max(scaF_log + scaM_old, igate)
+
+        scaF_act = torch.exp(scaF_log + scaM_old - scaM_state_new)
+        scaI_act = torch.exp(igate - scaM_state_new)
+
+        vecQ_scaled = query * (dhqk ** (-0.5))
+        matC_state_new = scaF_act[:, :, :, None] * matC_old + scaI_act[:, :, :, None] * (
+            key[:, :, :, None] @ value[:, :, None, :]
+        )
+        vecN_state_new = scaF_act * vecN_old + scaI_act * key
+        h_num = vecQ_scaled[:, :, None, :] @ matC_state_new.to(dtype=dtype_qkv)
+        h_num = h_num.squeeze(2).to(dtype=dtype_state)
+
+        qn_dotproduct = vecQ_scaled[:, :, None, :] @ vecN_state_new[:, :, :, None].to(dtype=dtype_qkv)
+        qn_dotproduct = qn_dotproduct.squeeze(2)
+        max_val = torch.exp(-scaM_state_new)
+        h_denom = (torch.maximum(qn_dotproduct.abs(), max_val) + eps).to(dtype=dtype_state)
+        h = h_num / h_denom
+
+        h = h.to(dtype=dtype_qkv)
+        matC_state_new = matC_state_new.to(dtype=dtype_state)
+        vecN_state_new = vecN_state_new.to(dtype=dtype_state)
+        scaM_state_new = scaM_state_new.to(dtype=dtype_state)
+        return h, (matC_state_new, vecN_state_new, scaM_state_new)
+
+    def mlstm_recurrent_sequence_native(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        igate: torch.Tensor,
+        fgate: torch.Tensor,
+        c_initial: Optional[torch.Tensor] = None,
+        n_initial: Optional[torch.Tensor] = None,
+        m_initial: Optional[torch.Tensor] = None,
+        return_last_states: bool = False,
+        eps: float = 1e-6,
+        dtype_state: torch.dtype = torch.float32,
+        **kwargs,
+    ) -> tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Optional[tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
+        Optional[tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
+    ]:
+        batch_size, nh, sequence_length, dhqk = query.shape
+        dhv = value.shape[-1]
+        device = query.device
+
+        if c_initial is not None:
+            if n_initial is None or m_initial is None:
+                raise ValueError("Initial states must be provided together.")
+            if n_initial is None or m_initial is None:
+                raise ValueError("Initial states must be provided together.")
+            matC_state, vecN_state, vecM_state = (
+                c_initial.to(dtype=dtype_state),
+                n_initial.to(dtype=dtype_state),
+                m_initial.to(dtype=dtype_state),
+            )
+        else:
+            # memory state
+            matC_state = torch.zeros((batch_size, nh, dhqk, dhv), dtype=dtype_state, device=device)
+            # normalizer state
+            vecN_state = torch.zeros((batch_size, nh, dhqk), dtype=dtype_state, device=device)
+            # max state
+            vecM_state = torch.zeros((batch_size, nh, 1), dtype=dtype_state, device=device)
+
+        vecH_list = []
+        for t in range(sequence_length):
+            # gates
+            vecF_t, vecI_t = fgate[:, :, t, None], igate[:, :, t, None]
+
+            # projections
+            vecQ_t, vecK_t, vecV_t = query[:, :, t, :], key[:, :, t, :], value[:, :, t, :]
+
+            # step
+            vecH, (matC_state, vecN_state, vecM_state) = mlstm_recurrent_step_native(
+                cstate=matC_state,
+                nstate=vecN_state,
+                mstate=vecM_state,
+                query=vecQ_t,
+                key=vecK_t,
+                value=vecV_t,
+                igate=vecI_t,
+                fgate=vecF_t,
+                eps=eps,
+                dtype_state=dtype_state,
+                **kwargs,
+            )
+            vecH_list.append(vecH)
+
+        matH = torch.stack(vecH_list, dim=-2)
+
+        if return_last_states:
+            return matH, (matC_state, vecN_state, vecM_state)
+        else:
+            return matH
+
+    def wrap_chunkwise_pad_zeros(
+        mlstm_chunkwise_kernel: Callable,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        fgate: torch.Tensor,
+        igate: torch.Tensor,
+        c_initial: Optional[torch.Tensor] = None,
+        n_initial: Optional[torch.Tensor] = None,
+        m_initial: Optional[torch.Tensor] = None,
+        return_last_states: bool = False,
+        eps: float = 1e-6,
+        autocast_kernel_dtype: torch.dtype = torch.bfloat16,
+        chunk_size: int = 64,
+        **kwargs,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+        if return_last_states:
+            raise ValueError(
+                "We are padding zeros, so we cannot return last states,",
+                "as they would be not the true last states.",
+            )
+
+        batch_size, nh, sequence_length, dhqk = query.shape
+        S_unpadded = sequence_length
+        # padding to chunk size for kernels
+        if sequence_length % chunk_size != 0:
+            S_padded = ((sequence_length + chunk_size - 1) // chunk_size) * chunk_size
+            q_pad = query.new_zeros(batch_size, nh, S_padded, query.shape[3])
+            k_pad = key.new_zeros(batch_size, nh, S_padded, key.shape[3])
+            v_pad = value.new_zeros(batch_size, nh, S_padded, value.shape[3])
+            i_pad = igate.new_zeros(batch_size, nh, S_padded)
+            f_pad = fgate.new_zeros(batch_size, nh, S_padded)
+            q_pad[:, :, :S_unpadded, :] = query
+            k_pad[:, :, :S_unpadded, :] = key
+            v_pad[:, :, :S_unpadded, :] = value
+            i_pad[:, :, :S_unpadded] = igate
+            f_pad[:, :, :S_unpadded] = fgate
+        else:
+            q_pad = query
+            k_pad = key
+            v_pad = value
+            i_pad = igate
+            f_pad = fgate
+
+        matH = mlstm_chunkwise_kernel(
+            query=q_pad,
+            key=k_pad,
+            value=v_pad,
+            igate=i_pad,
+            fgate=f_pad,
+            c_initial=c_initial,
+            n_initial=n_initial,
+            m_initial=m_initial,
+            return_last_states=return_last_states,
+            eps=eps,
+            autocast_kernel_dtype=autocast_kernel_dtype,
+            chunk_size=chunk_size,
+            **kwargs,
+        )
+        matH = matH[:, :, :S_unpadded, :]
+        return matH
+
+    def wrap_chunkwise_arbitrary_sequence_length(
+        mlstm_chunkwise_kernel: Callable,
+        mlstm_sequence_kernel: Callable,
+        mlstm_step_kernel: Callable,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        fgate: torch.Tensor,
+        igate: torch.Tensor,
+        c_initial: Optional[torch.Tensor] = None,
+        n_initial: Optional[torch.Tensor] = None,
+        m_initial: Optional[torch.Tensor] = None,
+        return_last_states: bool = True,
+        eps: float = 1e-6,
+        autocast_kernel_dtype: torch.dtype = torch.bfloat16,
+        chunk_size: int = 64,
+        enable_logging: bool = False,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+        """This function computes the last hidden state and matH outputs of the mLSTM, independently of the sequence length.
+
+        For this it uses three kernels:
+        - mlstm_chunkwise_kernel: mlstm chunkwise kernels that processes chunks of a given chunk size in parallel.
+        - mlstm_sequence_kernel: mlstm kernel that processes the remaining sequence length in a single step recurrence.
+        - mlstm_step_kernel: mlstm kernel that processes a sequence length of 1 in a single step.
+
+        It tries to maximize the chunksizes to improve performance.
+        It will start with the given chunk size and then divides the chunksize by 2 until the chunk size is smaller than 16.
+        At every chunksize it will process the maximal number of chunks that fit into the remaining sequence length.
+
+        E.g. for chunk_size = 64, this function will try the chunksizes [64, 32, 16] if necessary.
+
+        For the remaining sequence length, which is smaller than 16, we use a different kernel that computes the mLSTM
+        in a single step and loop over this in pytorch.
+
+        Args:
+            mlstm_chunkwise_kernel: The mLSTM chunkwise kernel that processes chunks of a given chunk size in parallel
+            mlstm_sequence_kernel: The mLSTM kernel that processes the remaining sequence length in a single step recurrence
+            query: The query tensor (batch_size, nh, sequence_length, dhqk)
+            key: The key tensor (batch_size, nh, sequence_length, dhqk)
+            value: The value tensor (batch_size, nh, sequence_length, dhhv)
+            fgate: The forget gate tensor (batch_size, nh, sequence_length)
+            igate: The input gate tensor (batch_size, nh, sequence_length)
+            c_initial: The initial cell state tensor (batch_size, nh, dhqk, dhhv)
+            n_initial: The initial hidden state tensor (batch_size, nh, dhqk)
+            m_initial: The initial memory state tensor (batch_size, nh, 1)
+            return_last_states: If True, the function will return the last states of the mLSTM
+            eps: The epsilon value used for numerical stability
+            autocast_kernel_dtype: The dtype used for the kernel computation
+            chunk_size: The chunk size used for the chunkwise kernel
+            enable_logging: If True, the function will log debug information. Default is False.
+
+        Returns:
+            The last hidden state tensor (batch_size, nh, sequence_length, dhhv) or a tuple containing the last hidden state tensor and the last states of the mLSTM
+            Last states are (cstate (batch_size, nh, dhqk, dhhv), nstate (batch_size, nh, dhqk), mstate (batch_size, nh, 1)).
+        """
+
+        batch_size, nh, sequence_length, dhqk = key.shape
+        dhhv = value.shape[-1]
+
+        c_state = (
+            c_initial
+            if c_initial is not None
+            else torch.zeros(batch_size, nh, dhqk, dhhv, device=key.device, dtype=torch.float32)
+        )
+        n_state = (
+            n_initial
+            if n_initial is not None
+            else torch.zeros(batch_size, nh, dhqk, device=key.device, dtype=torch.float32)
+        )
+        m_state = (
+            m_initial
+            if m_initial is not None
+            else torch.zeros(batch_size, nh, 1, device=key.device, dtype=torch.float32)
+        )
+
+        if sequence_length > 1:
+            # process the sequence length in chunks
+            h_outs = []
+            seq_len_start_idx = 0
+            remaining_seq_len = sequence_length - seq_len_start_idx
+            num_chunks = remaining_seq_len // chunk_size
+            if num_chunks > 0:
+                iter_seq_len = chunk_size * num_chunks
+                seq_len_idx = seq_len_start_idx + iter_seq_len
+                h_out, (c_state, n_state, m_state) = mlstm_chunkwise_kernel(
+                    query=query[..., seq_len_start_idx:seq_len_idx, :].contiguous(),
+                    key=key[..., seq_len_start_idx:seq_len_idx, :].contiguous(),
+                    value=value[..., seq_len_start_idx:seq_len_idx, :].contiguous(),
+                    fgate=fgate[..., seq_len_start_idx:seq_len_idx].contiguous(),
+                    igate=igate[..., seq_len_start_idx:seq_len_idx].contiguous(),
+                    c_initial=c_state,
+                    n_initial=n_state,
+                    m_initial=m_state,
+                    chunk_size=chunk_size,
+                    return_last_states=True,
+                    autocast_kernel_dtype=autocast_kernel_dtype,
+                    eps=eps,
+                )
+                seq_len_start_idx += iter_seq_len
+                h_outs.append(h_out)
+
+            remaining_seq_len = sequence_length - seq_len_start_idx
+
+            if remaining_seq_len > 0:
+                # we use here matK as query as this kernel does not need a query, since we do not care about the outputs only about the last state
+                h_out, (c_state, n_state, m_state) = mlstm_sequence_kernel(
+                    query=query[..., seq_len_start_idx:sequence_length, :].contiguous(),
+                    key=key[..., seq_len_start_idx:sequence_length, :].contiguous(),
+                    value=value[..., seq_len_start_idx:sequence_length, :].contiguous(),
+                    igate=igate[..., seq_len_start_idx:sequence_length].contiguous(),
+                    fgate=fgate[..., seq_len_start_idx:sequence_length].contiguous(),
+                    c_initial=c_state,
+                    n_initial=n_state,
+                    m_initial=m_state,
+                    return_last_states=True,
+                    eps=eps,
+                )
+                h_outs.append(h_out)
+            h_out = torch.concatenate(h_outs, dim=2)
+
+        else:
+            if sequence_length != 1:
+                raise ValueError(
+                    f"Received empty sequence (sequence_length={sequence_length}), require at least single element in the sequence."
+                )
+            # process the sequence length in a single step
+            # while this case is also captured by the regular mode above,
+            # it avoids the overhead of the loop and calls the step kernel directly
+            # The step function does not want a sequence dimension
+            # qkv shape is (batch_size, nh, dhqk/dhv)
+            # igate, fgate shape is (batch_size, nh, 1)
+            h_out, (c_state, n_state, m_state) = mlstm_step_kernel(
+                query=query.squeeze(2),
+                key=key.squeeze(2),
+                value=value.squeeze(2),
+                igate=igate,
+                fgate=fgate,
+                cstate=c_state,
+                nstate=n_state,
+                mstate=m_state,
+                eps=eps,
+            )
+            h_out = h_out[:, :, None, :]
+
+        if return_last_states:
+            return h_out, (c_state, n_state, m_state)
+        else:
+            return h_out
+
+    class xLSTMBackend(nn.Module):
+        """xLSTM Backend Module for PyTorch.
+
+        This module wraps the native mLSTM kernels and provides a high-level interface for training and inference.
+
+        Two callables are assembled once in `__init__` from the config:
+
+        - `_train_fn`: the chunkwise kernel with `autocast_kernel_dtype`, `eps` and `chunk_size` bound, additionally
+          wrapped in `wrap_chunkwise_pad_zeros` when `config.mode` contains `"with_padding"`.
+        - `_inference_fn`: `wrap_chunkwise_arbitrary_sequence_length` over the chunkwise, sequence and step
+          kernels, bound with `return_last_states=True`.
+        """
+
+        config_class = xLSTMConfig
+
+        def __init__(self, config: xLSTMConfig):
+            super().__init__()
+            self.config = config
+            self.chunkwise_kernel_fn = mlstm_chunkwise_native_autograd
+            self.sequence_kernel_fn = mlstm_recurrent_sequence_native
+            self.step_kernel_fn = mlstm_recurrent_step_native
+
+            # Resolve the dtype names stored in the config to torch dtypes once.
+            state_dtype = getattr(torch, config.inference_state_dtype)
+            kernel_dtype = getattr(torch, config.autocast_kernel_dtype)
+
+            self._inference_fn = partial(
+                wrap_chunkwise_arbitrary_sequence_length,
+                mlstm_chunkwise_kernel=self.chunkwise_kernel_fn,
+                mlstm_sequence_kernel=partial(self.sequence_kernel_fn, dtype_state=state_dtype),
+                mlstm_step_kernel=partial(self.step_kernel_fn, dtype_state=state_dtype),
+                chunk_size=config.chunk_size,
+                eps=config.eps,
+                autocast_kernel_dtype=kernel_dtype,
+                return_last_states=True,
+            )
+
+            train_kernel_fn = partial(
+                self.chunkwise_kernel_fn,
+                autocast_kernel_dtype=kernel_dtype,
+                eps=config.eps,
+                chunk_size=config.chunk_size,
+            )
+            if "with_padding" in config.mode:
+                train_kernel_fn = partial(wrap_chunkwise_pad_zeros, mlstm_chunkwise_kernel=train_kernel_fn)
+            self._train_fn = train_kernel_fn
+
+        def forward(
+            self,
+            query: torch.Tensor,
+            key: torch.Tensor,
+            value: torch.Tensor,
+            igate: torch.Tensor,
+            fgate: torch.Tensor,
+            c_initial: Optional[torch.Tensor] = None,
+            n_initial: Optional[torch.Tensor] = None,
+            m_initial: Optional[torch.Tensor] = None,
+            return_last_states: Optional[bool] = None,
+            mode: Optional[Literal["train", "inference"]] = None,
+        ) -> Union[torch.Tensor, tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]]:
+            """Forward pass of the mLSTM backend.
+
+            Depending on the effective mode, this method will call the appropriate kernel function.
+
+            Args:
+                query: The query tensor of shape (batch_size, nh, sequence_length, dhqk).
+                key: The key tensor of shape (batch_size, nh, sequence_length, dhqk).
+                value: The value tensor of shape (batch_size, nh, sequence_length, dhhv).
+                igate: The input gate preactivation tensor of shape (batch_size, nh, sequence_length).
+                fgate: The forget gate preactivation tensor of shape (batch_size, nh, sequence_length).
+                c_initial: The initial cell state tensor of shape (batch_size, nh, dhqk, dhhv).
+                                                    Defaults to None.
+                n_initial: The initial normalizer state tensor of shape (batch_size, nh, dhqk). Defaults to None.
+                m_initial: The initial max state tensor of shape (batch_size, nh, 1). Defaults to None.
+                return_last_states: Whether to return the last states of the sequence. Only used in train modes;
+                                                    inference always returns them. Defaults to None, in which case
+                                                    `config.return_last_states` is used, except when the module was
+                                                    configured with "train_with_padding" (last states are not
+                                                    supported there, so the fallback is False).
+                mode: Overrides `config.mode` for this call when given. Any string containing "train" selects the
+                                                    training kernel, any string containing "inference" the
+                                                    inference kernel.
+
+            Returns:
+                hidden states of shape (batch_size, nh, sequence_length, dhhv)
+                hidden states and last states the last states are the cell state cstate (batch_size, nh, dhqk, dhhv),
+                the normalizer state nstate (batch_size, nh, dhqk), and the max state mstate (batch_size, nh, 1)
+
+            Raises:
+                ValueError: If last states are explicitly requested while the module is configured with
+                    "train_with_padding", or if the effective mode is neither a train nor an inference mode.
+            """
+            if mode is None:
+                mode = self.config.mode
+
+            if "train" in mode:
+                # `_train_fn` was wrapped for padding based on the *configured* mode, so that is what is checked.
+                padded = self.config.mode == "train_with_padding"
+                if return_last_states is None:
+                    return_last_states = False if padded else self.config.return_last_states
+
+                if padded and return_last_states:
+                    raise ValueError("return_last_states=True is not supported with train_with_padding mode.")
+
+                return self._train_fn(
+                    query=query,
+                    key=key,
+                    value=value,
+                    igate=igate,
+                    fgate=fgate,
+                    c_initial=c_initial,
+                    n_initial=n_initial,
+                    m_initial=m_initial,
+                    return_last_states=return_last_states,
+                )
+
+            if "inference" in mode:
+                # inference mode always returns the last states
+                return self._inference_fn(
+                    query=query,
+                    key=key,
+                    value=value,
+                    igate=igate,
+                    fgate=fgate,
+                    c_initial=c_initial,
+                    n_initial=n_initial,
+                    m_initial=m_initial,
+                )
+
+            # Report the mode that was actually evaluated (it may have been passed explicitly).
+            raise ValueError(f"Unknown mode: {mode}")
+
+        def extra_repr(self) -> str:
+            """Show the full config in the module's `repr`."""
+            return f"{self.config}"
+
+    class xLSTMRMSNorm(nn.Module):
+        """RMS normalization over the last dimension, in the spirit of
+        https://pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html.
+
+        The input is divided by the root mean square of its last dimension, then optionally scaled by a
+        learnable weight and shifted by a learnable bias.
+
+        Args:
+            num_features: Size of the last (normalized) dimension.
+            eps: Constant added to the mean square before the reciprocal square root.
+            use_weight: Create a learnable per-feature scale (initialized to ones).
+            use_bias: Create a learnable per-feature shift (initialized to zeros).
+            force_float32_reductions: Compute the normalization in float32 and cast back afterwards.
+        """
+
+        def __init__(
+            self,
+            num_features: int,
+            eps: float = 1e-6,
+            use_weight: bool = True,
+            use_bias: bool = False,
+            force_float32_reductions: bool = True,
+        ):
+            super().__init__()
+            self.num_features = num_features
+            self.eps = eps
+            self.force_float32_reductions = force_float32_reductions
+
+            # Disabled affine terms are stored as plain `None` attributes.
+            self.weight = nn.Parameter(torch.ones(num_features)) if use_weight else None
+            self.bias = nn.Parameter(torch.zeros(num_features)) if use_bias else None
+
+        def _apply_weight_bias(self, x: torch.Tensor) -> torch.Tensor:
+            """Apply whichever of the scale and shift exist, in that order."""
+            scaled = x if self.weight is None else x * self.weight
+            return scaled if self.bias is None else scaled + self.bias
+
+        def _rms_normalize(self, x: torch.Tensor) -> torch.Tensor:
+            """Divide `x` by the RMS of its last dimension and return it in the input dtype."""
+            original_dtype = x.dtype
+            work = x.float() if self.force_float32_reductions else x
+            mean_square = work.pow(2).mean(dim=-1, keepdim=True)
+            normalized = work * torch.rsqrt(mean_square + self.eps)
+            return normalized.to(original_dtype)
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            """Normalize first, then apply the affine terms in the input dtype."""
+            return self._apply_weight_bias(self._rms_normalize(x))
+
+    class xLSTMMultiHeadLayerNorm(nn.Module):
+        """Layer normalization applied independently to every head.
+
+        The input must have shape (batch_size, sequence_length, nh, DH). Each DH-sized head vector is centered
+        and divided by its (biased) standard deviation; the heads are then flattened and the optional affine
+        terms, which span all `nh * DH` features, are applied.
+
+        Args:
+            num_heads: The number of heads.
+            head_dim: The head dimension.
+            eps: Constant added to the variance before the reciprocal square root.
+            use_weight: Create a learnable scale of size `num_heads * head_dim`.
+            use_bias: Create a learnable shift of size `num_heads * head_dim`.
+            force_float32_reductions: Compute the normalization in float32 and cast back afterwards.
+
+        Returns:
+            The normalized tensor with the shape (batch_size, sequence_length, nh * DH).
+        """
+
+        def __init__(
+            self,
+            num_heads: int,
+            head_dim: int,
+            eps: float = 1e-6,
+            use_weight: bool = True,
+            use_bias: bool = False,
+            force_float32_reductions: bool = True,
+        ):
+            super().__init__()
+            self.num_features = num_heads * head_dim
+            self.eps = eps
+            self.force_float32_reductions = force_float32_reductions
+
+            self.weight = nn.Parameter(torch.ones(self.num_features)) if use_weight else None
+            self.bias = nn.Parameter(torch.zeros(self.num_features)) if use_bias else None
+
+            self.num_heads = num_heads
+            self.head_dim = head_dim
+
+        def _apply_weight_bias(self, x: torch.Tensor) -> torch.Tensor:
+            """Apply whichever of the scale and shift exist, in that order."""
+            scaled = x if self.weight is None else x * self.weight
+            return scaled if self.bias is None else scaled + self.bias
+
+        def _layer_normalize(self, x: torch.Tensor) -> torch.Tensor:
+            """Center and rescale the last dimension; the result is returned in the input dtype."""
+            original_dtype = x.dtype
+            work = x.float() if self.force_float32_reductions else x
+            centered = work - work.mean(dim=-1, keepdim=True)
+            inv_std = torch.rsqrt(work.var(dim=-1, keepdim=True, unbiased=False) + self.eps)
+            return (centered * inv_std).to(original_dtype)
+
+        def forward(
+            self,
+            x: torch.Tensor,
+        ) -> torch.Tensor:
+            """Validate the head layout, normalize per head and flatten the heads."""
+            n_batch, n_steps, n_heads, d_head = x.shape
+            if n_heads != self.num_heads:
+                raise ValueError(f"Expected {self.num_heads} heads, got {n_heads}, input shape: {x.shape}")
+            if d_head != self.head_dim:
+                raise ValueError(f"Expected {self.head_dim} head dimension, got {d_head}, input shape: {x.shape}")
+
+            flat = self._layer_normalize(x).reshape(n_batch, n_steps, -1)
+            return self._apply_weight_bias(flat)
+
+    class xLSTMFeedForward(nn.Module):
+        """Gated feed-forward block computing `proj_down(SiLU(gate) * up)`.
+
+        The inner width is `hidden_size * ffn_proj_factor` passed through `round_up_to_next_multiple_of` with
+        `ffn_round_up_to_multiple_of`. With `weight_mode == "single"` the gate and up projections are separate
+        linear layers; with `weight_mode == "fused"` one linear layer of twice the width produces both.
+        """
+
+        def __init__(self, config: xLSTMConfig):
+            super().__init__()
+            self.config = config
+
+            hidden_size = config.hidden_size
+            use_bias = config.use_bias
+            self.up_proj_dim = round_up_to_next_multiple_of(
+                hidden_size * config.ffn_proj_factor,
+                config.ffn_round_up_to_multiple_of,
+            )
+
+            weight_mode = config.weight_mode
+            if weight_mode == "single":
+                self.proj_up_gate = nn.Linear(in_features=hidden_size, out_features=self.up_proj_dim, bias=use_bias)
+                self.proj_up = nn.Linear(in_features=hidden_size, out_features=self.up_proj_dim, bias=use_bias)
+            elif weight_mode == "fused":
+                # Gate and up projection share one matrix; the output is split in `forward`.
+                self.proj_up_gate_z = nn.Linear(
+                    in_features=hidden_size, out_features=2 * self.up_proj_dim, bias=use_bias
+                )
+
+            self.proj_down = nn.Linear(in_features=self.up_proj_dim, out_features=hidden_size, bias=use_bias)
+            self.act_fn = nn.SiLU()
+
+        def forward(self, x: torch.Tensor) -> torch.Tensor:
+            """Apply the gated projection followed by the down projection."""
+            weight_mode = self.config.weight_mode
+            if weight_mode == "single":
+                activated_gate = self.act_fn(self.proj_up_gate(x))
+                x = activated_gate * self.proj_up(x)
+            elif weight_mode == "fused":
+                fused = self.proj_up_gate_z(x)
+                # First `up_proj_dim` features are the gate, the remainder the up projection.
+                gate, up = torch.tensor_split(fused, (self.up_proj_dim,), dim=-1)
+                x = self.act_fn(gate) * up
+
+            return self.proj_down(x)
+
+    class xLSTMLayer(nn.Module):
+        """mLSTM sequence-mixing layer.
+
+        Projects the input to query/key/value, an output-gate preactivation and per-head input/forget gate
+        preactivations (soft-capped with `config.gate_soft_cap`), runs the mLSTM backend, applies a per-head
+        layer norm, gates the result with `sigmoid(o_preact)` and projects back to `hidden_size`.
+
+        With `weight_mode == "single"` every projection is its own linear layer; with `"fused"` q/k/v/o share
+        one linear layer and the two gates share another.
+        """
+
+        def __init__(self, config: xLSTMConfig):
+            super().__init__()
+            self.config = config
+
+            hidden_size = self.config.hidden_size
+            num_heads = self.config.num_heads
+            use_bias = self.config.use_bias
+
+            self.v_dim = int(config.hidden_size * config.v_dim_factor)
+            self.qk_dim = int(config.hidden_size * config.qk_dim_factor)
+
+            weight_mode = self.config.weight_mode
+            if weight_mode == "single":
+                self.q = nn.Linear(in_features=hidden_size, out_features=self.qk_dim, bias=use_bias)
+                self.k = nn.Linear(in_features=hidden_size, out_features=self.qk_dim, bias=use_bias)
+                self.v = nn.Linear(in_features=hidden_size, out_features=self.v_dim, bias=use_bias)
+
+                self.ogate_preact = nn.Linear(in_features=hidden_size, out_features=self.v_dim, bias=use_bias)
+                # The input/forget gates always carry a bias, independent of `use_bias`.
+                self.igate_preact = nn.Linear(in_features=hidden_size, out_features=num_heads, bias=True)
+                self.fgate_preact = nn.Linear(in_features=hidden_size, out_features=num_heads, bias=True)
+            elif weight_mode == "fused":
+                # Layout of the fused output: [query | key | value | output-gate preactivation].
+                self.qkv_opreact = nn.Linear(
+                    in_features=hidden_size,
+                    out_features=2 * self.qk_dim + 2 * self.v_dim,
+                    bias=use_bias,
+                )
+                # Layout of the fused gates: [input gate | forget gate], always with bias.
+                self.ifgate_preact = nn.Linear(in_features=hidden_size, out_features=2 * num_heads, bias=True)
+
+            self.ogate_act_fn = nn.Sigmoid()
+            self.mlstm_backend = xLSTMBackend(config=self.config)
+
+            self.multihead_norm = xLSTMMultiHeadLayerNorm(
+                num_heads=num_heads,
+                head_dim=self.v_dim // num_heads,
+                eps=self.config.norm_eps,
+                use_weight=True,
+                use_bias=use_bias,
+                force_float32_reductions=self.config.norm_reduction_force_float32,
+            )
+            self.out_proj = nn.Linear(in_features=self.v_dim, out_features=hidden_size, bias=use_bias)
+
+        def forward(
+            self, x: torch.Tensor, state: Optional[mLSTMLayerStateType] = None
+        ) -> tuple[torch.Tensor, Optional[mLSTMLayerStateType]]:
+            """Run the layer on `x` of shape [batch_size, sequence_length, HD].
+
+            Args:
+                x: Input of rank 3.
+                state: Optional `(c, n, m)` triple handed to the backend as initial state.
+
+            Returns:
+                The projected output and the state returned by the backend.
+
+            Raises:
+                ValueError: If `x` is not rank 3 or the backend output has an unexpected shape.
+            """
+            if x.ndim != 3:
+                raise ValueError(f"Input must have shape [batch_size, sequence_length, HD], got {x.shape}")
+            batch_size, sequence_length, _ = x.shape
+
+            num_heads = self.config.num_heads
+            cap = self.config.gate_soft_cap
+            weight_mode = self.config.weight_mode
+
+            if weight_mode == "single":
+                query = self.q(x)
+                key = self.k(x)
+                value = self.v(x)
+                o_preact = self.ogate_preact(x)
+                i_preact = soft_cap(self.igate_preact(x), cap_value=cap)
+                f_preact = soft_cap(self.fgate_preact(x), cap_value=cap)
+
+            elif weight_mode == "fused":
+                q_end = self.qk_dim
+                k_end = 2 * self.qk_dim
+                v_end = k_end + self.v_dim
+                query, key, value, o_preact = torch.tensor_split(self.qkv_opreact(x), (q_end, k_end, v_end), dim=-1)
+
+                capped_gates = soft_cap(self.ifgate_preact(x), cap_value=cap)
+                i_preact, f_preact = torch.tensor_split(capped_gates, (num_heads,), dim=-1)
+
+            def to_heads(t: torch.Tensor) -> torch.Tensor:
+                # [B, S, NH * D] -> [B, NH, S, D]
+                return t.reshape(batch_size, sequence_length, num_heads, -1).transpose(1, 2)
+
+            query, key, value = to_heads(query), to_heads(key), to_heads(value)
+            # Gates are already per head: [B, S, NH] -> [B, NH, S].
+            i_preact = i_preact.transpose(1, 2)
+            f_preact = f_preact.transpose(1, 2)
+
+            c_initial, n_initial, m_initial = (None, None, None) if state is None else state
+
+            h, state = self.mlstm_backend(
+                query=query,
+                key=key,
+                value=value,
+                igate=i_preact,
+                fgate=f_preact,
+                c_initial=c_initial,
+                n_initial=n_initial,
+                m_initial=m_initial,
+            )
+            expected_h_shape = (batch_size, num_heads, sequence_length, self.v_dim // num_heads)
+            if h.shape != expected_h_shape:
+                raise ValueError(f"Got {h.shape}, expected {expected_h_shape}")
+
+            # Back to [B, S, NH, D] for the per-head norm, which also flattens the heads.
+            h_norm = self.multihead_norm(h.transpose(1, 2))
+            h_norm = h_norm.reshape(batch_size, sequence_length, -1)
+
+            gated = self.ogate_act_fn(o_preact) * h_norm
+            return self.out_proj(gated), state
+
+    class xLSTMBlock(nn.Module):
+        """Pre-norm residual block: an mLSTM layer followed by a gated feed-forward layer."""
+
+        def __init__(self, config: xLSTMConfig):
+            super().__init__()
+            self.config = config
+
+            # Both RMS norms are configured identically.
+            norm_kwargs = {
+                "num_features": config.hidden_size,
+                "eps": config.norm_eps,
+                "use_weight": True,
+                "use_bias": config.use_bias,
+                "force_float32_reductions": config.norm_reduction_force_float32,
+            }
+            self.norm_mlstm = xLSTMRMSNorm(**norm_kwargs)
+            self.mlstm_layer = xLSTMLayer(config)
+            self.norm_ffn = xLSTMRMSNorm(**norm_kwargs)
+            self.ffn = xLSTMFeedForward(config)
+
+        def forward(
+            self, x: torch.Tensor, state: Optional[mLSTMStateType] = None
+        ) -> tuple[torch.Tensor, mLSTMStateType]:
+            """Apply both residual sub-layers and return the output with the mLSTM layer's state."""
+            mixed, state = self.mlstm_layer(self.norm_mlstm(x), state)
+            x = x + mixed
+
+            x = x + self.ffn(self.norm_ffn(x))
+
+            return x, state
+
+
+def small_init_method(dim):
+    """
+    Build an in-place initializer drawing from a zero-mean normal with std `sqrt(2 / (5 * dim))`.
+
+    Adapted from: https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/init_functions.py
+    The scheme is the one described in Transformers without Tears: Improving the Normalization of
+    Self-Attention - Nguyen, T. & Salazar, J. (2019).
+
+    Returns a callable that fills the tensor it is given and returns that same tensor."""
+    std = (2 / (5 * dim)) ** 0.5
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+
+    return init_
+
+
+def wang_init_method(n_layers, dim):
+    """
+    Build an in-place initializer drawing from a zero-mean normal with std `2 / n_layers / sqrt(dim)`.
+
+    Adapted from https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/init_functions.py
+
+    Returns a callable that fills the tensor it is given and returns that same tensor.
+    """
+    std = 2 / n_layers / dim**0.5
+
+    def init_(tensor):
+        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+
+    return init_
+
+
+class xLSTMPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class for an interface to loading a pre-trained xLSTM model.
+
+    Besides the usual class-level configuration it provides the xLSTM weight initialization, which picks a
+    scheme per module based on the module type and on its dotted name inside the model.
+    """
+
+    config_class = xLSTMConfig
+    base_model_prefix = "backbone"
+    _no_split_modules = ["xLSTMBlock"]
+    supports_gradient_checkpointing = True
+    _is_stateful = True
+
+    def _module_name_map(self, module):
+        """Return the dotted name of `module` inside `self`, or "" when it is not a submodule."""
+        for name, mod in self.named_modules():
+            if mod is module:
+                return name
+        return ""
+
+    def _init_weights(self, module):
+        """Initialize `module` in place.
+
+        - `nn.Embedding`: small init on the module's own weight.
+        - `nn.Linear`: bias zeroed; then, by name, gate projections get zero weights (input-gate bias -10,
+          forget-gate bias linearly spaced in [3, 6]), `proj_down` / `out_proj` get Wang init, everything else
+          small init.
+        - xLSTM norm layers: weight set to ones and bias to zeros, where present.
+        """
+        if isinstance(module, nn.Embedding):
+            # Initialize the module that was passed in; `self` does not necessarily own an `embeddings` attribute.
+            small_init_method(self.config.hidden_size)(module.weight)
+        elif isinstance(module, nn.Linear):
+            # The name lookup walks all submodules, so do it once.
+            name = self._module_name_map(module)
+            weight_mode = self.config.weight_mode
+            is_gate = "gate" in name
+
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+
+            if weight_mode == "single" and is_gate:
+                torch.nn.init.zeros_(module.weight)
+                if module.bias is not None:
+                    with torch.no_grad():
+                        if "igate" in name:
+                            module.bias.fill_(-10.0)
+                        elif "fgate" in name:
+                            module.bias.copy_(
+                                torch.linspace(3.0, 6.0, module.bias.shape[-1]).to(
+                                    device=module.bias.device,
+                                    dtype=module.bias.dtype,
+                                )
+                            )
+            elif weight_mode == "fused" and is_gate:
+                torch.nn.init.zeros_(module.weight)
+                # Only the fused input/forget gate projection carries gate biases. Its output is laid out as
+                # [input gate | forget gate], `num_heads` entries each (see the split in `xLSTMLayer.forward`).
+                if "ifgate" in name and module.bias is not None:
+                    num_heads = self.config.num_heads
+                    with torch.no_grad():
+                        module.bias[:num_heads].fill_(-10.0)
+                        module.bias[num_heads:].copy_(
+                            torch.linspace(3.0, 6.0, module.bias.shape[-1] - num_heads).to(
+                                device=module.bias.device,
+                                dtype=module.bias.dtype,
+                            )
+                        )
+            elif "proj_down" in name:
+                wang_init_method(dim=module.weight.shape[1], n_layers=self.config.num_hidden_layers)(module.weight)
+            elif "out_proj" in name:
+                wang_init_method(dim=self.config.hidden_size, n_layers=self.config.num_hidden_layers)(module.weight)
+            elif module.weight is not None:
+                small_init_method(self.config.hidden_size)(module.weight)
+        elif isinstance(module, xLSTMRMSNorm) or hasattr(module, "_layer_normalize"):
+            # Both norm layers may be built without weight and/or bias.
+            if getattr(module, "weight", None) is not None:
+                torch.nn.init.ones_(module.weight)
+            if getattr(module, "bias", None) is not None:
+                torch.nn.init.zeros_(module.bias)
+
+
+class xLSTMCache:
+    """
+    Cache for xLSTM model which does not have attention mechanism and key value states.
+
+    For every layer it holds a `(c, n, m)` triple of zero-initialized tensors with shapes
+    `(max_batch_size, num_heads, qk_head_dim, v_head_dim)`, `(max_batch_size, num_heads, qk_head_dim)` and
+    `(max_batch_size, num_heads, 1)`.
+
+    Arguments:
+        config (`PretrainedConfig`):
+            The configuration file defining the shape-related attributes required to initialize the static cache.
+        max_batch_size (`int`):
+            The batch size with which the model will be used.
+        dtype (`torch.dtype`, *optional*, defaults to `torch.bfloat16`):
+            The default `dtype` to use when initializing the layer.
+        device (`torch.device` or `str`, *optional*):
+            The device on which the cache should be initialized. Should be the same as the layer.
+
+    Attributes:
+        seqlen_offset: int
+        dtype: torch.dtype
+        rnn_state: dict mapping the layer index to its `(c, n, m)` state triple
+        rnn_state_initial: bool, True until a forward pass has written states into the cache
+
+    Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, xLSTMForCausalLM, xLSTMCache
+
+        >>> model = xLSTMForCausalLM.from_pretrained("NX-AI/xLSTM-7b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("NX-AI/xLSTM-7b")
+
+        >>> inputs = tokenizer(text="I am an xLSTM", return_tensors="pt")
+
+        >>> # Prepare a cache class and pass it to model's forward
+        >>> cache_params = xLSTMCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
+        >>> outputs = model(**inputs, cache_params=cache_params, use_cache=True)
+        >>> outputs.cache_params
+        xLSTMCache()
+        ```
+    """
+
+    def __init__(
+        self,
+        config: xLSTMConfig,
+        max_batch_size: int,
+        dtype: torch.dtype = torch.bfloat16,
+        device: Optional[Union[torch.device, str]] = None,
+        **kwargs,
+    ):
+        self.seqlen_offset = 0
+        self.dtype = dtype
+        self.config = config
+        # The model's forward flips this to False when it stores states; define it up front so the attribute
+        # exists on a fresh cache as well.
+        self.rnn_state_initial = True
+
+        def zeros(*shape):
+            return torch.zeros(list(shape), dtype=dtype, device=device)
+
+        self.rnn_state = {
+            layer: (
+                zeros(max_batch_size, config.num_heads, config.qk_head_dim, config.v_head_dim),
+                zeros(max_batch_size, config.num_heads, config.qk_head_dim),
+                zeros(max_batch_size, config.num_heads, 1),
+            )
+            for layer in range(config.num_hidden_layers)
+        }
+
+    def reset(self):
+        """Return the cache to its freshly constructed condition.
+
+        Every state tensor is replaced by a new zero tensor of the same shape, dtype and device, and the
+        bookkeeping (`seqlen_offset`, `rnn_state_initial`) is rewound so it matches the emptied states.
+        """
+        self.rnn_state = {
+            layer: tuple(torch.zeros_like(state) for state in states) for layer, states in self.rnn_state.items()
+        }
+        self.seqlen_offset = 0
+        self.rnn_state_initial = True
+
+
+@dataclass
+@auto_docstring
+class xLSTMOutput(ModelOutput):
+    r"""
+    cache_params (`xLSTMCache`):
+        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+        avoid providing the old `input_ids`.
+    """
+
+    # The only field without a default, so it must always be supplied.
+    last_hidden_state: Optional[torch.FloatTensor]
+    # Recurrent-state cache; stays None when the producer does not hand one back.
+    cache_params: Optional[xLSTMCache] = None
+    # Tuple of collected hidden states; stays None when they are not collected.
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring
+class xLSTMModel(xLSTMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        # `embedding_dim` and `num_blocks` are read from the config here.
+        self.embeddings = nn.Embedding(config.vocab_size, config.embedding_dim)
+        self.blocks = nn.ModuleList([xLSTMBlock(config) for _ in range(config.num_blocks)])
+        self.out_norm = xLSTMRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def set_input_embeddings(self, new_embedding):
+        self.embeddings = new_embedding
+
+    @staticmethod
+    def _store_rnn_state(cache_params, layer_idx, rnn_state):
+        """Copy one layer's freshly computed states into the cache in place and mark the cache as used."""
+        cached_states = cache_params.rnn_state[layer_idx]
+        for state_idx, cached in enumerate(cached_states):
+            cached.copy_(rnn_state[state_idx])
+        cache_params.rnn_state_initial = False
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_params: Optional[xLSTMCache] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, xLSTMOutput]:
+        r"""
+        cache_params (`xLSTMCache`, *optional*):
+            The xLSTMCache that carries the RNN states.
+        """
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False)
+        if self.gradient_checkpointing and self.training and use_cache:
+            use_cache = False
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        if use_cache and cache_params is None:
+            cache_params = xLSTMCache(
+                self.config, inputs_embeds.size(0), device=inputs_embeds.device, dtype=inputs_embeds.dtype
+            )
+
+        hidden_states = inputs_embeds
+        # Defined before branching so that the return statement can always reference it.
+        all_hidden_states = () if output_hidden_states else None
+
+        sequence_length = hidden_states.shape[1]
+        chunk_size = self.config.max_inference_chunksize
+
+        if not self.training and chunk_size < sequence_length and not output_hidden_states:
+            # Long-sequence inference: feed the sequence chunk by chunk, carrying the recurrent state in the cache.
+            with torch.no_grad():
+                if cache_params is None:
+                    # The cache must live on the same device as the activations it is fed back with.
+                    cache_params = xLSTMCache(
+                        config=self.config,
+                        max_batch_size=hidden_states.shape[0],
+                        device=hidden_states.device,
+                        dtype=hidden_states.dtype,
+                    )
+                final_state = torch.zeros_like(hidden_states)
+                for start in range(0, sequence_length, chunk_size):
+                    end = min(start + chunk_size, sequence_length)
+                    hidden_states_chunk = hidden_states[:, start:end]
+                    for layer_idx, xlstm_block in enumerate(self.blocks):
+                        hidden_states_chunk, rnn_state = xlstm_block(
+                            hidden_states_chunk,
+                            state=cache_params.rnn_state[layer_idx],
+                        )
+                        self._store_rnn_state(cache_params, layer_idx, rnn_state)
+                    final_state[:, start:end] = hidden_states_chunk
+                hidden_states = final_state
+        else:
+            for layer_idx, xlstm_block in enumerate(self.blocks):
+                layer_state = cache_params.rnn_state[layer_idx] if cache_params is not None else None
+                if self.gradient_checkpointing and self.training:
+                    hidden_states, rnn_state = self._gradient_checkpointing_func(
+                        xlstm_block.__call__,
+                        hidden_states,
+                        layer_state,
+                    )
+                else:
+                    hidden_states, rnn_state = xlstm_block(hidden_states, state=layer_state)
+                if cache_params is not None:
+                    self._store_rnn_state(cache_params, layer_idx, rnn_state)
+
+                if output_hidden_states:
+                    all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if use_cache:
+            cache_params.seqlen_offset += inputs_embeds.shape[1]
+
+        hidden_states = self.out_norm(hidden_states)
+
+        if output_hidden_states:
+            # The normalized final output is appended after the per-block outputs.
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        return xLSTMOutput(
+            last_hidden_state=hidden_states,
+            cache_params=cache_params,
+            hidden_states=all_hidden_states,
+        )
+
+
+@dataclass
+@auto_docstring
+class xLSTMCausalLMOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    cache_params (`xLSTMCache`, *optional*, carrying the RNN states):
+        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
+        avoid providing the old `input_ids`.
+    """
+
+    # Every field defaults to None, so the output can be built from any subset of them.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    cache_params: Optional[xLSTMCache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring
+class xLSTMForCausalLM(xLSTMPreTrainedModel, GenerationMixin):
+    def __init__(self, config):
+        """Build the xLSTM backbone and a bias-free linear head mapping `hidden_size` to `vocab_size`."""
+        super().__init__(config)
+        self.backbone = xLSTMModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        """Return the language-modeling head (`self.lm_head`)."""
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        """Replace the language-modeling head with `new_embeddings`."""
+        self.lm_head = new_embeddings
+
+    def get_input_embeddings(self):
+        """Return the input embeddings by delegating to the backbone."""
+        return self.backbone.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        return self.backbone.set_input_embeddings(new_embeddings)
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        attention_mask=None,  # not used but needed, otherwise generate complains when passing tokenizer inputs
+        inputs_embeds=None,
+        use_cache=None,
+        cache_params: Optional[xLSTMCache] = None,
+        **kwargs,
+    ):
+        if use_cache and cache_params is not None:
+            # If the first cache position is non-zero, we assume we are in generation mode.
+            # Thus, the cache_params state is assumed to be the state before the last token
+            # (lastly generated token), and all previous tokens are already ingested.
+            # This should as well support generation from scratch with the [BOS] token inserted first.
+            input_ids = input_ids[:, -1:]
+            if inputs_embeds is not None:
+                inputs_embeds = inputs_embeds[:, -1:]
+
+        if inputs_embeds is not None and cache_params is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update({"cache_params": cache_params, "use_cache": use_cache})
+
+        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+
+        return model_inputs
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_params: Optional[xLSTMCache] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, xLSTMCausalLMOutput]:
+        r"""
+        cache_params (`xLSTMCache`, *optional*):
+            The xLSTMCache that carries the RNN states.
+        """
+        xlstm_outputs = self.backbone(
+            input_ids,
+            cache_params=cache_params,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_hidden_states=output_hidden_states,
+            **kwargs,
+        )
+        hidden_states = xlstm_outputs[0]
+
+        logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
+
+        if not self.training and self.config.max_inference_chunksize < logits.shape[1]:
+            offset = 0
+            with torch.no_grad():
+                while offset < logits.shape[1]:
+                    logits[:, offset : min(offset + self.config.max_inference_chunksize, logits.shape[1])] = soft_cap(
+                        logits[:, offset : min(offset + self.config.max_inference_chunksize, logits.shape[1])],
+                        self.config.output_logit_soft_cap,
+                    )
+                    offset += self.config.max_inference_chunksize
+        else:
+            logits = soft_cap(logits, self.config.output_logit_soft_cap)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            # Shift so that tokens < nstate predict nstate
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        return xLSTMCausalLMOutput(
+            loss=loss,
+            logits=logits,
+            cache_params=xlstm_outputs.cache_params,
+            hidden_states=xlstm_outputs.hidden_states,
+        )
+
+
+__all__ = [
+    "xLSTMForCausalLM",
+    "xLSTMModel",
+    "xLSTMPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5113e54261d789b2c2a2621691f7674b8f3cece7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/configuration_xmod.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/configuration_xmod.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..781f8a565acf89271a6870d04216433684b1816c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/configuration_xmod.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/modeling_xmod.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/modeling_xmod.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0399bd3705864081c7146f4ccbc18f25892c1631
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xmod/__pycache__/modeling_xmod.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8c8b42d7b9265051cfb060b2c3504cf868175e8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/configuration_yolos.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/configuration_yolos.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2cc240d34a12d261989b6c631eafbaf2589a55b7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/configuration_yolos.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/feature_extraction_yolos.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/feature_extraction_yolos.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0df7d535ec21cb830359fd52bd879b7d780a6644
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/feature_extraction_yolos.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/image_processing_yolos.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/image_processing_yolos.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a77f5038a2dfde175ff42a62f8ce13a458dc53b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/image_processing_yolos.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/image_processing_yolos_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/image_processing_yolos_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b49d7b3ae1a9891684987385d0dcabdc132c3779
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/image_processing_yolos_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/modeling_yolos.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/modeling_yolos.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9364425c69a78955671f8a38c5b233c20e60bef4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/modeling_yolos.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/modular_yolos.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/modular_yolos.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..798cbea9a868133e2305f1b744ca59f4eaffc585
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yolos/__pycache__/modular_yolos.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71932b515da1d7e18fbd8dc57d8fc6fb83ea1844
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/configuration_yoso.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/configuration_yoso.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a26e039527c81e4cbcdde68048497ea287278382
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/configuration_yoso.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/modeling_yoso.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/modeling_yoso.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6f52a06cb624ba3cbb18af48385239981314a97
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/yoso/__pycache__/modeling_yoso.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..00db458c72ebd5513d8e8cf1b186f49886745a2a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:  # only static type checkers take this branch and see the real star-imports
+    from .configuration_zamba2 import *
+    from .modeling_zamba2 import *
+else:  # at runtime, replace this module in sys.modules with a _LazyModule built from the file's import structure
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/configuration_zamba2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/configuration_zamba2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2789e508f6fa7e4c79de9abefd0da150ec1aeb2e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/configuration_zamba2.py
@@ -0,0 +1,241 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/zamba2/modular_zamba2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_zamba2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+
+
+class Zamba2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Zamba2Model`]. It is used to instantiate a
+    Zamba2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Zamba2 model.
+
+    [Zyphra/Zamba2-2.7B](https://huggingface.co/Zyphra/Zamba2-2.7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Zamba2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Zamba2Model`]
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with.
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the hidden representations.
+        num_hidden_layers (`int`, *optional*, defaults to 54):
+            Number of hidden layers in the model.
+        layers_block_type (`list`, *optional*):
+            List of layer types, which can be either "mamba" or "hybrid".
+        mamba_d_state (`int`, *optional*, defaults to 64): shape of the state space latents.
+        mamba_d_conv (`int`, *optional*, defaults to 4): Size of the convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
+        mamba_ngroups (`int`, *optional*, defaults to 1):
+            Number of groups for the evolution matrices of mamba 2.
+        time_step_min (`float`, *optional*, defaults to 0.001):
+            Minimum `time_step` used to bound `dt_proj.bias`.
+        time_step_max (`float`, *optional*, defaults to 0.1):
+            Maximum `time_step` used to bound `dt_proj.bias`.
+        time_step_floor (`float`, *optional*, defaults to 0.0001):
+            Minimum clamping value of the `dt_proj.bias` layer initialization.
+        time_step_limit (`tuple`, *optional*):
+            Accepted range of time step values.
+        n_mamba_heads (`int`, *optional*, defaults to 8):
+            Number of heads for the evolution matrices of mamba 2.
+        use_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use bias in the convolution layer of the mixer block.
+        chunk_size (`int`, *optional*, defaults to 256):
+            Size of the chunks that will comprise the sequence.
+        use_mem_eff_path (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the fused conv1d and scan in mamba2 layers.
+        add_bias_linear (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in various layers
+        intermediate_size (`int`, *optional*, defaults to 4 * hidden_size):
+            Dimension of the MLP representations.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the MLP.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245).
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_mem_blocks (`int`, *optional*, defaults to 1):
+            Number of unshared transformer blocks.
+        use_shared_attention_adapter (`bool`, *optional*, defaults to `False`):
+            If True, unshared adapters (formally the same as LoRA but used in the base model) will be added to the q, k, v projectors in the shared attention layers.
+        adapter_rank (`int`, *optional*, defaults to 128):
+            Rank of the adapter in the shared MLP and shared attention layers.
+        use_mem_rope (`bool`, *optional*, defaults to `False`):
+            If True, includes RoPE in the shared attention layers.
+        rope_theta (`float`, *optional*, defaults to `10000.0`):
+            The base period of the RoPE embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
+            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
+            sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
+            significantly.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        use_long_context (`bool`, *optional*, defaults to `False`):
+            Context-extended Zamba: forces `max_position_embeddings=16384`; with `use_mem_rope`, rescales `rope_theta`.
+    ```python
+    >>> from transformers import Zamba2Model, Zamba2Config
+    >>> # Initializing a Zamba2-2.7B style configuration
+    >>> configuration = Zamba2Config()
+    >>> # Initializing a model from the Zamba2-2.7B style configuration
+    >>> model = Zamba2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "zamba2"
+    attribute_map = {"head_dim": "attention_head_dim"}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        max_position_embeddings=4096,
+        hidden_size=2560,
+        num_hidden_layers=54,
+        layers_block_type=None,
+        mamba_d_state=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_ngroups=1,
+        time_step_min=0.001,
+        time_step_max=0.1,
+        time_step_floor=1e-4,
+        time_step_limit=None,
+        n_mamba_heads=8,
+        use_conv_bias=True,
+        chunk_size=256,
+        use_mem_eff_path=False,
+        add_bias_linear=False,
+        intermediate_size=None,
+        hidden_act="gelu",
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        attention_dropout=0.0,
+        num_mem_blocks=1,
+        use_shared_attention_adapter=False,
+        adapter_rank=128,
+        use_mem_rope=False,
+        rope_theta=10000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        use_long_context=False,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        if intermediate_size is None:
+            self.intermediate_size = 4 * hidden_size
+        else:
+            self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_mem_blocks = num_mem_blocks
+        self.attention_hidden_size = 2 * hidden_size
+        self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads
+        self.attention_dropout = attention_dropout
+        self.use_mem_rope = use_mem_rope
+        self.use_long_context = use_long_context
+        # NOTE(review): the rescaled value is what gets stored; rebuilding from saved fields may rescale it again - confirm.
+        if use_mem_rope and use_long_context:
+            a = 8
+            rope_theta = rope_theta * a ** (self.attention_head_dim / (self.attention_head_dim - 2))
+        self.rope_theta = rope_theta
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.add_bias_linear = add_bias_linear
+        self.mamba_ngroups = mamba_ngroups
+        self.n_mamba_heads = n_mamba_heads
+        self.mamba_headdim = int(mamba_expand * hidden_size) // n_mamba_heads
+        self.use_conv_bias = use_conv_bias
+        self.chunk_size = chunk_size
+        self.time_step_limit = time_step_limit
+        self.use_shared_attention_adapter = use_shared_attention_adapter
+        self.adapter_rank = adapter_rank
+        self.time_step_min = time_step_min
+        self.time_step_max = time_step_max
+        self.time_step_floor = time_step_floor
+        if use_long_context:
+            self.max_position_embeddings = 16384  # replaces the value taken from the argument above
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.kv_channels = self.hidden_size // self.num_attention_heads
+        self.num_query_groups = self.num_attention_heads
+        # Below, "mamba" stands for mamba layer, "hybrid" stands for hybrid layer (composed by a shared transformer followed by mamba layer)
+        if layers_block_type is None:
+            self.layers_block_type = (
+                ["mamba"]
+                + (["mamba"] * 5 + ["hybrid"]) * 7
+                + ["mamba"] * 4
+                + ["hybrid"]
+                + ["mamba"] * 3
+                + ["hybrid"]
+                + ["mamba"] * 2
+            )
+        else:
+            self.layers_block_type = layers_block_type
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+        self.hybrid_layer_ids = [
+            index for index, block_type in enumerate(self.layers_block_type) if block_type == "hybrid"
+        ]
+        self.use_mem_eff_path = use_mem_eff_path
+
+
+__all__ = ["Zamba2Config"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/modeling_zamba2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/modeling_zamba2.py
new file mode 100644
index 0000000000000000000000000000000000000000..60e546f321209f38b1a02e984b98ddb8e6590dc1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/modeling_zamba2.py
@@ -0,0 +1,1738 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/zamba2/modular_zamba2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_zamba2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import re
+from itertools import cycle
+from typing import Any, Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
+from .configuration_zamba2 import Zamba2Config
+
+
+# Optional kernel imports. When an availability check fails, the corresponding names are bound to `None`
+# instead, so later code can test them for truthiness (see `is_fast_path_available` further down).
+if is_mamba_ssm_available():
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+else:
+    selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined = None, None, None
+
+# Same pattern for the causal-conv1d package.
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+logger = logging.get_logger(__name__)
+
+
+class Zamba2RMSNormGated(torch.nn.Module):
+    """
+    Group-wise RMS normalization with an optional SiLU gate.
+
+    The last dimension of the input is split into consecutive groups of `group_size` elements and every group is
+    normalized independently by its own root-mean-square. A learnable per-channel `weight` scales the result.
+    """
+
+    def __init__(self, hidden_size, group_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+        self.group_size = group_size
+
+    def forward(self, hidden_states, gate=None):
+        """
+        Args:
+            hidden_states: tensor whose last dimension is a multiple of `group_size`.
+            gate: optional tensor; when given, `hidden_states` is multiplied by `silu(gate)` before normalizing.
+
+        Returns:
+            The normalized tensor multiplied by `weight`; the normalized values are cast back to the input dtype
+            before the multiplication.
+        """
+        original_dtype = hidden_states.dtype
+        # All statistics are computed in float32.
+        values = hidden_states.to(torch.float32)
+        if gate is not None:
+            values = values * nn.functional.silu(gate.to(torch.float32))
+
+        leading_dims = values.shape[:-1]
+        num_groups = values.shape[-1] // self.group_size
+
+        # [..., hidden] -> [..., num_groups, group_size] so that the mean runs over one group at a time
+        grouped = values.view(*leading_dims, num_groups, self.group_size)
+        mean_square = grouped.pow(2).mean(-1, keepdim=True)
+        grouped = grouped * torch.rsqrt(mean_square + self.variance_epsilon)
+
+        # back to [..., hidden]
+        values = grouped.view(*leading_dims, num_groups * self.group_size)
+        return self.weight * values.to(original_dtype)
+
+
+class Zamba2RMSNorm(nn.Module):
+    """RMS normalization over the last dimension with a learnable scale (equivalent to T5LayerNorm)."""
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Zamba2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Normalize `hidden_states` by its root-mean-square (computed in float32) and scale by `weight`."""
+        original_dtype = hidden_states.dtype
+        values = hidden_states.to(torch.float32)
+        mean_square = values.pow(2).mean(-1, keepdim=True)
+        values = values * torch.rsqrt(mean_square + self.variance_epsilon)
+        return self.weight * values.to(original_dtype)
+
+    def extra_repr(self):
+        """Text shown inside the module's `repr`: the weight shape and epsilon."""
+        weight_shape = tuple(self.weight.shape)
+        return f"{weight_shape}, eps={self.variance_epsilon}"
+
+
+class Zamba2HybridDynamicCache:
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    The cache keeps four per-layer containers, each with one entry per hidden layer (`config.num_hidden_layers`):
+
+    - `key_cache` and `value_cache` (lists): every entry starts as an empty placeholder of shape `(batch_size, 0)`.
+      The first `update` call for a layer replaces the placeholder with the given states; later calls concatenate
+      the new states along dim 2.
+    - `conv_states` (dict keyed by layer index): zero-initialised tensors of shape
+      `(batch_size, intermediate_size + 2 * mamba_ngroups * mamba_d_state, mamba_d_conv)`.
+    - `ssm_states` (dict keyed by layer index): zero-initialised tensors of shape
+      `(batch_size, n_mamba_heads, mamba_headdim, mamba_d_state)`.
+
+    `transformer_layers` holds the indices of the layers whose block type is `"hybrid"`.
+    """
+
+    is_compileable = False
+
+    def __init__(
+        self, config: Zamba2Config, batch_size: int, dtype: torch.dtype = torch.float16, device: Optional[str] = None
+    ):
+        """
+        Allocate the per-layer state containers.
+
+        Args:
+            config: model configuration; the mamba dimensions, `layers_block_type` and `num_hidden_layers` are read.
+            batch_size: leading dimension of every allocated tensor.
+            dtype: dtype of the conv and ssm state tensors.
+            device: device on which all tensors are created.
+        """
+        self.dtype = dtype
+        self.layers_block_type = config.layers_block_type
+        self.has_previous_state = False
+        self.intermediate_size = int(config.mamba_expand * config.hidden_size)
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.n_mamba_heads = config.n_mamba_heads
+        self.transformer_layers = []
+        # NOTE(review): empty nn.Module-style registries; their consumer is not visible in this file section.
+        self._modules = {}
+        self._parameters = {}
+        self._buffers = {}
+        self.conv_states = {}
+        self.ssm_states = {}
+        for i in range(config.num_hidden_layers):
+            # Conv and ssm states are allocated for every layer, hybrid or not.
+            self.conv_states[i] = torch.zeros(
+                batch_size,
+                self.intermediate_size + 2 * config.mamba_ngroups * config.mamba_d_state,
+                self.conv_kernel_size,
+                device=device,
+                dtype=dtype,
+            )
+            self.ssm_states[i] = torch.zeros(
+                batch_size, self.n_mamba_heads, config.mamba_headdim, self.ssm_state_size, device=device, dtype=dtype
+            )
+            if self.layers_block_type[i] == "hybrid":
+                self.transformer_layers.append(i)
+        # Empty `(batch_size, 0)` placeholders; `update` detects them through a last dimension of size 0.
+        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+
+    def __len__(self):
+        """Number of per-layer entries in the attention cache."""
+        return len(self.key_cache)
+
+    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        """Return the `(key, value)` cache tensors stored for `layer_idx`."""
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Store new key/value states for `layer_idx` and return the full cached tensors.
+
+        `cache_kwargs` is accepted but not used by this implementation.
+        """
+        # Update the cache
+        if self.key_cache[layer_idx].shape[-1] == 0:
+            # Still the empty placeholder: take the new states as they are.
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            # `beam_idx` is moved to each tensor's own device before indexing the batch dimension.
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_states[layer_idx].device
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.ssm_states[layer_idx].device
+            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        if layer_idx not in self.transformer_layers:
+            if not self.transformer_layers:
+                # No hybrid layer exists, so no attention states can have been cached.
+                return 0
+            layer_idx = self.transformer_layers[0]
+        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx].numel() == 0:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def update_conv_state(
+        self, layer_idx: int, new_conv_state: torch.Tensor, cache_position: torch.LongTensor
+    ) -> torch.Tensor:
+        """
+        Shift the stored convolution state of `layer_idx` one step to the left along the last dimension, write
+        `new_conv_state` at `cache_position` (clamped to the kernel window) and return the stored tensor.
+        """
+        conv_state = self.conv_states[layer_idx]
+        cache_position = cache_position.clamp(0, self.conv_kernel_size - 1)
+
+        conv_state = conv_state.roll(shifts=-1, dims=-1)
+        conv_state[:, :, cache_position] = new_conv_state.to(conv_state.device)
+        # Write the result back in place so that the tensor object held in `conv_states` stays the same.
+        self.conv_states[layer_idx].zero_()
+        self.conv_states[layer_idx] += conv_state
+        return self.conv_states[layer_idx]
+
+    def reset(self):
+        """Zero every conv and ssm state tensor in place."""
+        # `conv_states` and `ssm_states` are dicts of tensors, so each tensor has to be zeroed individually.
+        for conv_state in self.conv_states.values():
+            conv_state.zero_()
+        for ssm_state in self.ssm_states.values():
+            ssm_state.zero_()
+
+
+class Zamba2RotaryEmbedding(nn.Module):
+    """Computes the cos/sin tables of the rotary position embedding for given position ids."""
+
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Zamba2Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if isinstance(rope_scaling, dict):
+            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+        max_positions = config.max_position_embeddings
+        self.max_seq_len_cached = max_positions
+        self.original_max_seq_len = max_positions
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        # The init function returns the inverse frequencies and a scaling factor applied to cos/sin in `forward`.
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        """Return `(cos, sin)` for `position_ids`, cast to the dtype of `x` (only its device and dtype are used)."""
+        batch_size = position_ids.shape[0]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch_size, -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        # Autocast is disabled below; "mps" and non-string device types fall back to "cpu".
+        device_type = x.device.type
+        if not (isinstance(device_type, str) and device_type != "mps"):
+            device_type = "cpu"
+
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            angles = torch.matmul(inv_freq_expanded.float(), position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((angles, angles), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along the head dimension, matching
+    torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads,
+    seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim). With `n_rep == 1` the input is returned
+    unchanged.
+    """
+    batch, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        # Insert a repeat axis right after the head axis, broadcast it, then fold it into the head axis.
+        expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, dim)
+        hidden_states = expanded.reshape(batch, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    """
+    Plain matmul/softmax attention.
+
+    Keys and values are first repeated `module.num_key_value_groups` times along the head dimension. When
+    `attention_mask` is given it is cropped on its last dimension to the key length and added to the scaled scores.
+    The softmax runs in float32 and is cast back to the query dtype; dropout follows `module.training`.
+
+    Returns:
+        `(attn_output, attn_weights)`, where `attn_output` has dims 1 and 2 swapped and is contiguous.
+    """
+    repeats = module.num_key_value_groups
+    key_states = repeat_kv(key, repeats)
+    value_states = repeat_kv(value, repeats)
+
+    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        kv_len = key_states.shape[-2]
+        scores = scores + attention_mask[:, :, :, :kv_len]
+
+    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
+
+    context = torch.matmul(probs, value_states).transpose(1, 2).contiguous()
+    return context, probs
+
+
+def rotate_half(x):
+    """Split the last dimension in two halves `(a, b)` and return the concatenation `(-b, a)`."""
+    half = x.shape[-1] // 2
+    first, second = x[..., :half], x[..., half:]
+    return torch.cat((-second, first), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Rotate the query and key tensors with the given rotary position embedding tables.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
+            For `cos`/`sin` of shape [batch_size, seq_len, head_dim], use `unsqueeze_dim=1` when `q` and `k` are
+            [batch_size, heads, seq_len, head_dim] and `unsqueeze_dim=2` when they are
+            [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+
+    def _rotate(tensor):
+        # x * cos + rotate_half(x) * sin
+        return (tensor * cos_b) + (rotate_half(tensor) * sin_b)
+
+    return _rotate(q), _rotate(k)
+
+
+class Zamba2Attention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
+    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
+    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
+    (see fig. 2 in https://huggingface.co/papers/2405.16712).
+    Additionally, replaced
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
+    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
+    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
+    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
+    """
+
+    def __init__(
+        self,
+        config: Zamba2Config,
+        layer_idx: Optional[int] = None,
+        num_fwd_mem_blocks: Optional[int] = None,
+        block_id: Optional[int] = None,
+    ):
+        """
+        Args:
+            config: model configuration.
+            layer_idx: stored on the module; `forward` takes its own `layer_idx` argument.
+            num_fwd_mem_blocks: number of adapter slots to create. It is passed to `range()`, so it must be an int
+                whenever `config.use_shared_attention_adapter` is true.
+            block_id: slot `i` gets a trainable low-rank adapter when `i % config.num_mem_blocks == block_id`;
+                every other slot is an `nn.Identity`.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+
+        self.attention_hidden_size = config.attention_hidden_size
+        self.head_dim = config.attention_head_dim
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        # Scores are scaled by 1 / sqrt(head_dim / 2), see the class docstring.
+        self.scaling = (self.head_dim / 2) ** -0.5
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+
+        self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+        self.num_fwd_mem_blocks = num_fwd_mem_blocks
+        self.layer_block_map = config.hybrid_layer_ids
+        self.block_id = block_id
+
+        if config.use_shared_attention_adapter:
+            self.linear_q_adapter_list = nn.ModuleList([])
+            self.linear_k_adapter_list = nn.ModuleList([])
+            self.linear_v_adapter_list = nn.ModuleList([])
+
+            for i in range(self.num_fwd_mem_blocks):
+                if i % config.num_mem_blocks == block_id:
+                    # Low-rank adapter: attention_hidden_size -> adapter_rank -> attention_hidden_size, no biases.
+                    linear_q_adapter = nn.Sequential(
+                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
+                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
+                    )
+                    linear_k_adapter = nn.Sequential(
+                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
+                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
+                    )
+                    linear_v_adapter = nn.Sequential(
+                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
+                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
+                    )
+                else:
+                    # NOTE(review): with an Identity slot, `forward` adds `hidden_states` itself to the projections.
+                    # Presumably a block is only called with layer indices that map to its own adapter slots;
+                    # confirm against the caller.
+                    linear_q_adapter = nn.Identity()
+                    linear_k_adapter = nn.Identity()
+                    linear_v_adapter = nn.Identity()
+                self.linear_q_adapter_list.append(linear_q_adapter)
+                self.linear_k_adapter_list.append(linear_k_adapter)
+                self.linear_v_adapter_list.append(linear_v_adapter)
+
+        # Maps a hybrid layer index to its position in `config.hybrid_layer_ids`, i.e. to its adapter slot.
+        self.layer_dic = {value: index for index, value in enumerate(self.layer_block_map)}
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_idx: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            hidden_states: input to the q/k/v projections; all dimensions except the last are kept in the output.
+            layer_idx: index used to pick the adapter slot (through `layer_dic`) and the cache entry to update.
+            attention_mask: forwarded unchanged to the attention implementation.
+            past_key_values: when given, new key/value states are stored through `update` and the returned full
+                states are used for attention.
+            position_embeddings: `(cos, sin)` pair; only read when `config.use_mem_rope` is true.
+
+        Returns:
+            `(attn_output, attn_weights)`: the output of `o_proj` and the weights returned by the attention
+            implementation.
+        """
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        if self.config.use_shared_attention_adapter:
+            adapter_layer_idx = self.layer_dic[layer_idx]
+            query_states = query_states + self.linear_q_adapter_list[adapter_layer_idx](hidden_states)
+            key_states = key_states + self.linear_k_adapter_list[adapter_layer_idx](hidden_states)
+            value_states = value_states + self.linear_v_adapter_list[adapter_layer_idx](hidden_states)
+
+        # Split the last dimension into (heads, head_dim), then swap dims 1 and 2.
+        query_states = query_states.view(hidden_shape).transpose(1, 2)
+        key_states = key_states.view(hidden_shape).transpose(1, 2)
+        value_states = value_states.view(hidden_shape).transpose(1, 2)
+
+        if self.config.use_mem_rope:
+            cos, sin = position_embeddings
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, layer_idx)
+
+        # Eager attention unless the config selects another registered implementation.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+# Helper methods for segment sum computation
+
+
+def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
+    """
+    Append `pad_size` zeros at the end of the seq_len dimension (dim=1) of `input_tensor`.
+
+    Assumes that we only have tensors of either size 4 or 3
+    """
+    # `pad` takes (left, right) pairs starting from the last dimension; only the right side of dim 1 is padded.
+    if input_tensor.dim() == 4:
+        padding = (0, 0, 0, 0, 0, pad_size, 0, 0)
+    else:
+        padding = (0, 0, 0, pad_size, 0, 0)
+
+    return torch.nn.functional.pad(input_tensor, padding, mode="constant", value=0)
+
+
+def reshape_into_chunks(input_tensor, pad_size, chunk_size):
+    """
+    Pad `input_tensor` with `pad_size` zeros on the seq_len dim (dim=1), then split that dimension into chunks of
+    `chunk_size` elements.
+
+    Assumes that we only have tensors of either size 4 or 3
+    """
+    # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...]
+    padded = pad_tensor_by_size(input_tensor, pad_size)
+    batch = padded.shape[0]
+
+    if padded.dim() == 3:
+        # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads]
+        target_shape = (batch, -1, chunk_size, padded.shape[2])
+    else:
+        # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size]
+        target_shape = (batch, -1, chunk_size, padded.shape[2], padded.shape[3])
+
+    return padded.reshape(*target_shape)
+
+
+def segment_sum(input_tensor):
+    """
+    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
+
+    For an input `[..., chunk_size]` the result is `[..., chunk_size, chunk_size]`: entry `(i, j)` holds the sum of
+    the input elements `j + 1 .. i` when `i >= j` (so the diagonal is 0) and `-inf` when `i < j`.
+    """
+    size = input_tensor.size(-1)
+    # [..., chunk_size] -> [..., chunk_size, chunk_size]; row i is filled with input[..., i]
+    repeated = input_tensor.unsqueeze(-1).expand(*input_tensor.size(), size)
+
+    all_true = torch.ones(size, size, device=input_tensor.device, dtype=torch.bool)
+    strictly_below = torch.tril(all_true, diagonal=-1)
+    on_or_below = torch.tril(all_true, diagonal=0)
+
+    # Zero the diagonal and everything above it, then accumulate down the rows.
+    partial = repeated.masked_fill(~strictly_below, 0)
+    running = torch.cumsum(partial, dim=-2)
+
+    # Keep the lower triangle including the diagonal; everything above becomes -inf.
+    return running.masked_fill(~on_or_below, -torch.inf)
+
+
+# True only when all three optional kernel functions were imported (each name is bound to `None` otherwise).
+is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
+
+
+class Zamba2MambaMixer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
+    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
+    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
+    and is why Mamba is called **selective** state spaces)
+    """
+
+    def __init__(self, config: Zamba2Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.intermediate_size = int(config.mamba_expand * self.hidden_size)
+        self.layer_idx = layer_idx
+        self.use_conv_bias = config.use_conv_bias
+        self.activation = "silu"
+        self.act = nn.SiLU()
+        self.use_mem_eff_path = config.use_mem_eff_path
+
+        self.n_groups = config.mamba_ngroups
+        self.head_dim = config.mamba_headdim
+        self.num_heads = self.config.n_mamba_heads
+        self.chunk_size = config.chunk_size
+
+        self.time_step_limit = config.time_step_limit
+        self.time_step_min = config.time_step_min
+        self.time_step_max = config.time_step_max
+
+        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
+        self.conv1d = nn.Conv1d(
+            in_channels=self.conv_dim,
+            out_channels=self.conv_dim,
+            bias=True,
+            kernel_size=config.mamba_d_conv,
+            groups=self.conv_dim,
+            padding=config.mamba_d_conv - 1,
+        )
+
+        # projection of the input hidden states
+        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
+        self.in_proj = nn.Linear(
+            self.hidden_size,
+            projection_size,
+            bias=config.add_bias_linear,
+        )
+        # selective projection used to make dt, B and C input dependent
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.num_heads + 1)
+        self.A_log = nn.Parameter(torch.log(A))
+        self.norm = Zamba2RMSNormGated(
+            self.intermediate_size, group_size=self.intermediate_size // self.n_groups, eps=1e-5
+        )
+        self.D = nn.Parameter(torch.ones(self.num_heads))
+
+        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.add_bias_linear)
+
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
+                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d"
+            )
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[Zamba2HybridDynamicCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # set up dimensions for reshapes later
+
+        batch_size, seq_len, _ = hidden_states.shape
+        groups_time_state_size = self.n_groups * self.ssm_state_size
+        d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads
+
+        # getting projected states from cache if it exists
+        if cache_params is not None and cache_params.has_previous_state:
+            in_projected_states = self.in_proj(hidden_states.squeeze(1))  # (B 2D)
+            d_mlp = (in_projected_states.shape[-1] - d_to_remove) // 2
+            split_projection_dim = [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads]
+            _, _, gate, hidden_states_B_C, dt = torch.split(in_projected_states, split_projection_dim, dim=-1)
+
+            hidden_states_B_C = causal_conv1d_update(
+                hidden_states_B_C,
+                cache_params.conv_states[self.layer_idx],
+                self.conv1d.weight.squeeze(1),
+                self.conv1d.bias,
+                self.activation,
+            )
+
+            hidden_states, B, C = torch.split(
+                hidden_states_B_C,
+                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                dim=-1,
+            )
+            A = -torch.exp(self.A_log.float())  # (nheads,)
+
+            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D = self.D[:, None, ...].expand(-1, self.head_dim)
+            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
+            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
+            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
+            hidden_states = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states_reshaped,
+                dt,
+                A,
+                B,
+                C,
+                D,
+                z=None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
+            hidden_states = self.norm(hidden_states, gate)
+            out = self.out_proj(hidden_states)[:, None, ...]
+        # if no cache is found, calling the kernel
+        else:
+            if attention_mask is not None and not torch.all(attention_mask == 1):
+                # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                dtype = hidden_states.dtype
+                hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+            # 1. Gated MLP's linear projection
+            projected_states = self.in_proj(hidden_states)
+            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
+            dt_limit_kwargs = {} if self.time_step_limit is None else {"dt_limit": self.time_step_limit}
+            if attention_mask is not None:
+                input_not_masked = torch.all(attention_mask == 1)
+            else:
+                input_not_masked = True
+
+            if self.use_mem_eff_path and self.training and cache_params is None and input_not_masked:
+                out, ssm_state = mamba_split_conv1d_scan_combined(
+                    projected_states,
+                    self.conv1d.weight.squeeze(1),
+                    self.conv1d.bias,
+                    self.dt_bias,
+                    A,
+                    D=self.D,
+                    chunk_size=self.chunk_size,
+                    seq_idx=None,
+                    activation=self.activation,
+                    rmsnorm_weight=self.norm.weight,
+                    rmsnorm_eps=self.norm.variance_epsilon,
+                    outproj_weight=self.out_proj.weight,
+                    outproj_bias=self.out_proj.bias,
+                    headdim=self.head_dim,
+                    ngroups=self.n_groups,
+                    norm_before_gate=False,
+                    return_final_states=True,
+                    **dt_limit_kwargs,
+                )
+
+            else:
+                gate, hidden_states_B_C, time_step = torch.split(
+                    projected_states,
+                    [self.intermediate_size, self.conv_dim, self.num_heads],
+                    dim=-1,
+                )
+
+                # 1D Convolution
+                if cache_params is not None:
+                    hidden_states_B_C_t = hidden_states_B_C.transpose(1, 2)
+                    conv_state = nn.functional.pad(
+                        hidden_states_B_C_t, (self.conv_kernel_size - hidden_states_B_C_t.shape[-1], 0)
+                    )
+                    cache_params.conv_states[self.layer_idx].copy_(conv_state)
+                if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
+                    hidden_states_B_C = self.act(
+                        self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len]
+                    )  # (B, L, self.d_inner + 2 * ngroups * d_state)
+                else:
+                    hidden_states_B_C = causal_conv1d_fn(
+                        x=hidden_states_B_C.transpose(1, 2),
+                        weight=self.conv1d.weight.squeeze(1),
+                        bias=self.conv1d.bias,
+                        activation=self.activation,
+                    ).transpose(1, 2)[:, :seq_len]
+                hidden_states, B, C = torch.split(
+                    hidden_states_B_C,
+                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                    dim=-1,
+                )
+                if attention_mask is not None and not torch.all(attention_mask == 1):
+                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                    dtype = hidden_states.dtype
+                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+                scan_output, ssm_state = mamba_chunk_scan_combined(
+                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
+                    time_step,
+                    A,
+                    B.view(batch_size, seq_len, self.n_groups, -1),
+                    C.view(batch_size, seq_len, self.n_groups, -1),
+                    chunk_size=self.chunk_size,
+                    D=self.D,
+                    z=None,
+                    seq_idx=None,
+                    return_final_states=True,
+                    dt_bias=self.dt_bias,
+                    dt_softplus=True,
+                    **dt_limit_kwargs,
+                )
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+                scan_output = scan_output.view(batch_size, seq_len, -1)
+                # Multiply "gate" branch and apply extra normalization layer
+                scan_output = self.norm(scan_output, gate)
+                out = self.out_proj(scan_output)
+        return out
+
+    # fmt: off
+    def torch_forward(self, input_states, cache_params: Optional[Zamba2HybridDynamicCache]=None, attention_mask: Optional[torch.Tensor]=None):
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+        # Gated MLP's linear projection
+        if cache_params is not None and cache_params.has_previous_state:
+            projected_states = self.in_proj(input_states.squeeze(1))
+        else:
+            if attention_mask is not None and not torch.all(attention_mask==1):
+                # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                input_states = (input_states * attention_mask[:, :, None]).to(dtype)
+            projected_states = self.in_proj(input_states)
+        d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size- self.num_heads) // 2
+        _, _, gate, hidden_states, dt = projected_states.split(
+                [d_mlp, d_mlp, self.intermediate_size,  self.conv_dim, self.num_heads], dim=-1
+        )
+
+        # Convolution sequence transformation
+        if cache_params is not None:
+            ssm_state = cache_params.ssm_states[self.layer_idx].clone()
+            ssm_state = ssm_state.to(hidden_states.device)
+            if cache_params.has_previous_state:
+                gate = gate.unsqueeze(1)
+                conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
+                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
+                # handle batched generation - states are copied through
+                conv_state[:, :, -1] = hidden_states[:, 0, :] if hidden_states.ndim == 3 else hidden_states
+                cache_params.conv_states[self.layer_idx].copy_(conv_state)
+                hidden_states = torch.sum(conv_state.to(projected_states.device) * self.conv1d.weight[:, 0, :], dim=-1)
+                if self.use_conv_bias:
+                    hidden_states += self.conv1d.bias
+                hidden_states = self.act(hidden_states).to(dtype)[:, None, ...]         # [batch, 1, intermediate_size] : decoding
+            else:
+                hidden_states = hidden_states.transpose(1,2)
+                conv_state = nn.functional.pad(
+                    hidden_states,
+                    (self.conv_kernel_size - hidden_states.shape[-1], 0)
+                )
+                cache_params.conv_states[self.layer_idx].copy_(conv_state)
+                hidden_states = self.act(self.conv1d(hidden_states).transpose(1,2))[:, :seq_len, :]     # [batch, intermediate_size, seq_len]
+                if attention_mask is not None and not torch.all(attention_mask==1):
+                    dtype = hidden_states.dtype
+                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+        else:
+            ssm_state = torch.zeros(
+                (batch_size, self.num_heads, self.head_dim, self.ssm_state_size),
+                device=hidden_states.device, dtype=dtype
+            )
+            hidden_states = self.act(self.conv1d(hidden_states.transpose(1, 2))[..., :seq_len].transpose(1, 2))
+        hidden_states, B, C = torch.split(hidden_states, [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], dim=-1)
+        A = -torch.exp(self.A_log.float())                            # [num_heads]
+        if cache_params is not None and cache_params.has_previous_state:
+            # Note: there is no need to pad parameter matrices here, as there is just one new token
+            # for batched generation
+            dt = dt[:, None, ...] if dt.ndim == 2 else dt[:, 0, :][:, None, ...]
+            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
+            # [num_heads] -> [num_heads, head_dim]
+            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)
+
+            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
+            dt = torch.clamp(dt, self.time_step_min) #, self.time_step_max)
+            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            # [bsz, num_heads, head_dim, state_size]
+            dA = torch.exp(dt[..., None] * A)
+
+            # Discretize B
+            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
+            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
+            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
+            B = B.reshape(batch_size, -1, B.shape[-1])
+            # [bsz, num_heads, head_dim, state_size]
+            dB = dt[..., None] * B[..., None, :]
+
+            # Discretize x into dB
+            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
+            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
+            dBx = dB * hidden_states[..., None]
+
+            # State calculation
+            cache_params.ssm_states[self.layer_idx].copy_(
+                cache_params.ssm_states[self.layer_idx] * dA + dBx
+            )
+
+            # Subsequent output
+            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
+            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
+            C = C.reshape(batch_size, -1, C.shape[-1])
+            # [bsz, num_heads, head_dim]
+
+            ssm_states = cache_params.ssm_states[self.layer_idx].to(C.dtype)  # Shape: [b, h, d, n]
+            # Reshape ssm_states to merge the first two dimensions
+            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
+            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
+            y = torch.bmm(ssm_states_reshaped, C_reshaped)
+            y = y.view(batch_size, self.num_heads, self.head_dim)
+
+            # D skip connection
+            # [num_heads] -> [num_heads, head_dim]
+            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
+            y = (y + hidden_states * D).to(y.dtype)
+
+            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
+            y = y.reshape(batch_size, -1)[:, None, ...]
+        else:
+            # begin ssd naive implementation without einsums
+            dt = nn.functional.softplus(dt + self.dt_bias)
+            dt = torch.clamp(dt, self.time_step_min)
+            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
+            B = B.reshape(batch_size, seq_len,  -1, self.ssm_state_size).float()
+            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
+
+            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
+
+            # Discretize x and A
+            hidden_states = hidden_states * dt[..., None]
+            A = A.to(hidden_states.dtype) * dt
+
+            # Rearrange into blocks/chunks
+            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]
+
+
+            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
+            A = A.permute(0, 3, 1, 2)
+            A_cumsum = torch.cumsum(A, dim=-1)
+
+            # 1. Compute the output for each intra-chunk (diagonal blocks)
+            # This is the analog of a causal mask
+            L = torch.exp(segment_sum(A))
+
+            # First, contraction of C and B to get G (attention-weights like)
+            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, : ,:]  # shape: (b, c, l, s, h, n)
+            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)
+
+
+            # Step 2: Compute M, equivalent to applying attention mask to weights
+            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
+            M = M_intermediate.sum(dim=-1)
+
+            # Step 3: Compute Y_diag (apply to values)
+            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(3)
+
+            # (right term of low-rank factorization of off-diagonal blocks; B terms)
+
+            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
+            B_decay_contraction = B * decay_states.permute(0, 2, 3, 1)[..., None]
+            # permute back B * decay states
+            states = (B_decay_contraction.permute(0, 1, 3, 2, 4)[..., None]  * hidden_states.permute(0, 1, 3, 2, 4)[..., None, :]).sum(dim=3).permute(0, 1, 2, 4, 3)
+            if cache_params is not None and cache_params.has_previous_state:
+                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...]
+            else:
+                previous_states = torch.zeros_like(states[:, :1])
+            states = torch.cat([previous_states, states], dim=1)
+            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
+
+            states_permuted = states.permute(0, 2, 1, 3, 4)
+            result = (decay_chunk[..., None, None] * states_permuted[:, :, None, ...]).sum(dim=2)
+            new_states = result.permute(0, 2, 1, 3, 4)
+            states, ssm_state = new_states[:, :-1], new_states[:, -1]
+
+            # Compute state -> output conversion per chunk
+            # (left term of low-rank factorization of off-diagonal blocks; C terms)
+            state_decay_out = torch.exp(A_cumsum)
+            # compute Yoff
+            C_times_states = (C[..., None, :] * states[:, :, None, ...])
+            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
+            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])
+            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
+
+            y = Y_diag + Y_off
+            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
+            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)
+
+            y = y + D_residual
+            # Cutting off padded chunks
+            if pad_size > 0:
+                y = y[:, :seq_len, :, :]
+            y = y.reshape(batch_size, seq_len, -1)
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        scan_output = self.norm(y, gate)
+
+        # end ssd naive
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[Zamba2HybridDynamicCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask)
+
+        return self.torch_forward(hidden_states, cache_params, attention_mask)
+
+
+class Zamba2MLP(nn.Module):
+    def __init__(self, config: Zamba2Config, num_fwd_mem_blocks=None, block_id: Optional[int] = None):
+        """
+        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
+        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
+        """
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.num_fwd_mem_blocks = num_fwd_mem_blocks
+        self.block_id = block_id
+
+        self.gate_up_proj = nn.Linear(self.hidden_size, 2 * self.intermediate_size, bias=config.add_bias_linear)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.add_bias_linear)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+        self.gate_up_proj_adapter_list = nn.ModuleList([])
+        for i in range(self.num_fwd_mem_blocks):
+            if i % config.num_mem_blocks == block_id:
+                gate_up_proj_adapter = nn.Sequential(
+                    nn.Linear(self.config.hidden_size, self.config.adapter_rank, bias=False),
+                    nn.Linear(self.config.adapter_rank, 2 * self.intermediate_size, bias=False),
+                )
+            else:
+                gate_up_proj_adapter = nn.Identity()
+            self.gate_up_proj_adapter_list.append(gate_up_proj_adapter)
+
+        layer_block_map = config.hybrid_layer_ids
+        self.layer_dic = {value: index for index, value in enumerate(layer_block_map)}
+
+    def forward(self, hidden_state, layer_idx=None):
+        gate_up_state = self.gate_up_proj(hidden_state)
+        layer_idx = self.layer_dic[layer_idx]
+        gate_up_state = gate_up_state + self.gate_up_proj_adapter_list[layer_idx](hidden_state)
+
+        gate_up_state = torch.chunk(gate_up_state, 2, dim=-1)
+        hidden_state = self.act_fn(gate_up_state[0]) * gate_up_state[1]
+        output = self.down_proj(hidden_state)
+        return output
+
+
+class Zamba2AttentionDecoderLayer(nn.Module):
+    def __init__(self, config: Zamba2Config, block_id: Optional[int] = None, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.block_id = block_id
+        num_gs = len(config.hybrid_layer_ids)
+        self.self_attn = Zamba2Attention(config, layer_idx=-1, num_fwd_mem_blocks=num_gs, block_id=block_id)
+        self.feed_forward = Zamba2MLP(config, num_fwd_mem_blocks=num_gs, block_id=block_id)
+        self.input_layernorm = Zamba2RMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = Zamba2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: torch.Tensor,
+        layer_idx: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        position_embeddings: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
+            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
+                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
+                concatenated tensor is then used as input of the pre-attention RMSNorm
+                (see fig. 2 in https://huggingface.co/papers/2405.16712).
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+        """
+        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            layer_idx=layer_idx,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states, layer_idx)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+class Zamba2MambaDecoderLayer(nn.Module):
+    def __init__(self, config: Zamba2Config, layer_idx: int):
+        super().__init__()
+        self.mamba = Zamba2MambaMixer(config=config, layer_idx=layer_idx)
+        self.input_layernorm = Zamba2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: Optional[torch.Tensor] = None,
+        layer_idx: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        transformer_hidden_states: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+        """
+
+        residual = hidden_states
+
+        # `transformer_hidden_states` is the output from shared transformer + linear layer (see fig. 2 in https://huggingface.co/papers/2405.16712).
+        # `transformer_hidden_states` is then added to the input to the mamba layer below (as described in eq. (6) of https://huggingface.co/papers/2405.16712).
+        hidden_states = (
+            hidden_states + transformer_hidden_states if transformer_hidden_states is not None else hidden_states
+        )
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.mamba(
+            hidden_states=hidden_states,
+            cache_params=past_key_values,
+            attention_mask=attention_mask,
+        )
+
+        self_attn_weights = None
+
+        # residual connection after mamba
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (past_key_values,)
+
+        return outputs
+
+
+class Zamba2HybridLayer(nn.Module):
+    def __init__(
+        self, shared_transformer: Zamba2AttentionDecoderLayer, linear: nn.Linear, mamba: Zamba2MambaDecoderLayer
+    ):
+        super().__init__()
+        self.linear = linear
+        self.mamba_decoder = mamba
+        self.shared_transformer = shared_transformer
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: Optional[torch.Tensor] = None,
+        layer_idx: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        position_embeddings: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
+            hidden activations to form the input of the shared transformer layer.
+            layer_idx (`int`): layer number.
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+        """
+
+        layer_outputs = self.shared_transformer(
+            hidden_states,
+            original_hidden_states=original_hidden_states,
+            layer_idx=layer_idx,
+            attention_mask=causal_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            position_embeddings=position_embeddings,
+        )
+
+        transformer_hidden_states = layer_outputs[0]
+
+        if output_attentions:
+            self_attn_weights = layer_outputs[1]
+
+        transformer_hidden_states = self.linear(transformer_hidden_states)
+
+        layer_outputs = self.mamba_decoder(
+            hidden_states,
+            transformer_hidden_states=transformer_hidden_states,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+        )
+
+        if output_attentions:
+            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]
+
+        return layer_outputs
+
+
+class Zamba2PreTrainedModel(PreTrainedModel):
+    config: Zamba2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Zamba2AttentionDecoderLayer", "Zamba2MambaDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_sdpa = True
+    # Note: only supports Zamba2HybridDynamicCache
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Zamba2MambaMixer):
+            dt = torch.exp(
+                torch.rand(self.config.n_mamba_heads)
+                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+                + math.log(self.config.time_step_min)
+            ).clamp(min=self.config.time_step_floor)
+            # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+            inv_dt = dt + torch.log(-torch.expm1(-dt))
+            module.dt_bias.data.copy_(inv_dt)
+
+            A = torch.arange(1, module.num_heads + 1)
+            module.A_log.data.copy_(torch.log(A))
+            module.D.data.fill_(1.0)
+
+
+@auto_docstring
+class Zamba2Model(Zamba2PreTrainedModel):
+    """
+    Model consisting of *config.num_hidden_layers* layers.
+
+    Args:
+        config: Zamba2Config
+    """
+
+    def __init__(self, config: Zamba2Config):
+        super().__init__(config)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        blocks = [Zamba2AttentionDecoderLayer(config, block_id=k) for k in range(config.num_mem_blocks)]
+        mamba_layers = []
+        linear_layers = []
+        self.layers_block_type = config.layers_block_type
+        for i in range(config.num_hidden_layers):
+            if config.layers_block_type[i] == "mamba":
+                mamba_layers.append(Zamba2MambaDecoderLayer(config, layer_idx=i))
+            elif config.layers_block_type[i] == "hybrid":
+                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
+                mamba_layers.append(Zamba2MambaDecoderLayer(config, layer_idx=i))
+        mamba_layers = iter(mamba_layers)
+        linear_layers = iter(linear_layers)
+        blocks = cycle(blocks)
+        layers = self.get_layers(blocks, linear_layers, mamba_layers)
+        self.layers = nn.ModuleList(layers)
+
+        self._attn_implementation = config._attn_implementation
+        self.final_layernorm = Zamba2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        if config.use_mem_rope:
+            if config.use_long_context:
+                logger.warning_once(
+                    "`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`."
+                )
+            self.rotary_emb = Zamba2RotaryEmbedding(config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        hidden_states = inputs_embeds
+
+        original_hidden_states = torch.clone(inputs_embeds)
+        # original_hidden_states: word embedding output that will be concatenated with hidden activations to form the input of the shared transformer layer
+
+        if use_cache and past_key_values is None:
+            batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+            past_key_values = Zamba2HybridDynamicCache(self.config, batch_size, dtype=self.dtype, device=self.device)
+
+        if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length(layer_idx=self.first_transformer_layer_id)
+                if past_key_values is not None
+                else 0
+            )
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
+
+        # create position embeddings to be shared across the decoder layers
+        if self.config.use_mem_rope:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        else:
+            position_embeddings = None
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for layer_idx, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer.__call__,
+                    hidden_states,
+                    original_hidden_states,
+                    layer_idx,
+                    attention_mask,
+                    causal_mask,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = layer(
+                    hidden_states,
+                    original_hidden_states=original_hidden_states,
+                    layer_idx=layer_idx,
+                    attention_mask=attention_mask,
+                    causal_mask=causal_mask,
+                    past_key_values=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    position_embeddings=position_embeddings,
+                )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if past_key_values is not None and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+        return output if return_dict else output.to_tuple()
+
+    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        target_length = cache_position[-1] + 1
+
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            if attention_mask.dim() == 2:
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
+                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    def get_layers(self, blocks, linear_layers, mamba_layers):
+        layers = []
+        self._tied_weights_keys = []
+        self.first_transformer_layer_id = 0
+        for layer_id, layer_type in enumerate(self.layers_block_type):
+            if layer_type == "hybrid":
+                if self.first_transformer_layer_id == 0:
+                    self.first_transformer_layer_id = layer_id
+                block = next(blocks)
+                if self.config.num_mem_blocks * len(self.config.hybrid_layer_ids) > 1:
+                    prefix_pattern = rf"^layers\.{layer_id}\.shared_transformer\."
+                    main_keys_pattern = re.compile(
+                        prefix_pattern
+                        + r"(?:"
+                        + r"self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|"
+                        + r"feed_forward\.(?:gate_up_proj|down_proj)\.weight|"
+                        + r"(?:input_layernorm|pre_ff_layernorm)\.weight"
+                        + r")$"
+                    )
+                    self._tied_weights_keys.append(main_keys_pattern)
+
+                    adapter_id = 0
+                    for _layer_type in self.layers_block_type:
+                        if _layer_type == "hybrid" and adapter_id % self.config.num_mem_blocks == block.block_id:
+                            adapter_pattern = re.compile(
+                                r"^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\."
+                                + str(adapter_id)
+                                + r"\.(?:0|1)\.weight$"
+                            )
+                            self._tied_weights_keys.append(adapter_pattern)
+                        adapter_id += 1
+                    if self.config.use_shared_attention_adapter:
+                        adapter_id = 0
+                        for _layer_type in self.layers_block_type:
+                            if _layer_type == "hybrid" and adapter_id % self.config.num_mem_blocks == block.block_id:
+                                attn_adapter_pattern = re.compile(
+                                    r"^shared_transformer\.self_attn\."
+                                    + r"(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\."
+                                    + str(adapter_id)
+                                    + r"\.(?:0|1)\.weight$"
+                                )
+                                self._tied_weights_keys.append(attn_adapter_pattern)
+                            adapter_id += 1
+                layers.append(Zamba2HybridLayer(block, next(linear_layers), next(mamba_layers)))
+            else:
+                layers.append(next(mamba_layers))
+        return layers
+
+
+# Adapted from transformers.models.jamba.modeling_jamba.JambaForCausalLM with Jamba->Zamba2, JAMBA->ZAMBA2
+class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
+    def __init__(self, config: Zamba2Config):
+        super().__init__(config)
+        self.model = Zamba2Model(config)
+        self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys]
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Zamba2ForCausalLM
+
+        >>> model = Zamba2ForCausalLM.from_pretrained("Zyphra/Zamba2-7B-v1")
+        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B-v1")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        # Overwritten -- has a unique cache type, `Zamba2HybridDynamicCache`
+
+        empty_past_kv = past_key_values is None
+
+        # Omit tokens covered by past_key_values
+        if not empty_past_kv:
+            # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+            # Exception 1: when passing input_embeds, input_ids may be missing entries
+            # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+            # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+            #              (we can't check exception 3 while compiling)
+            if (
+                inputs_embeds is not None  # Exception 1
+                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        else:
+            past_key_values = Zamba2HybridDynamicCache(
+                self.config, input_ids.shape[0], dtype=self.dtype, device=self.device
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "logits_to_keep": self.config.num_logits_to_keep,
+                "cache_position": cache_position,
+            }
+        )
+
+        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+
+        return model_inputs
+
+
+@auto_docstring(
+    custom_intro="""
+    The Zamba2 Model with a sequence classification head on top (linear layer).
+
+    [`Zamba2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class Zamba2ForSequenceClassification(Zamba2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = Zamba2Model(config)
+        self._tied_weights_keys = self.model._tied_weights_keys
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = ["Zamba2ForCausalLM", "Zamba2ForSequenceClassification", "Zamba2Model", "Zamba2PreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/modular_zamba2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/modular_zamba2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d05b23721142fd3df14aa4808b6c18212e3aa6df
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/modular_zamba2.py
@@ -0,0 +1,1152 @@
+# coding=utf-8
+# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import re
+from itertools import cycle
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import (
+    is_causal_conv1d_available,
+    is_mamba_ssm_available,
+)
+from ..llama.modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
+from ..mamba2.modeling_mamba2 import pad_tensor_by_size, reshape_into_chunks, segment_sum
+from ..zamba.modeling_zamba import (
+    ZambaAttention,
+    ZambaAttentionDecoderLayer,
+    ZambaForCausalLM,
+    ZambaForSequenceClassification,
+    ZambaHybridDynamicCache,
+    ZambaHybridLayer,
+    ZambaMambaDecoderLayer,
+    ZambaModel,
+    ZambaRMSNorm,
+    eager_attention_forward,
+)
+from .configuration_zamba2 import Zamba2Config
+
+
+if is_mamba_ssm_available():
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
+else:
+    selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined = None, None, None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
+
+
+_CONFIG_FOR_DOC = "Zyphra/Zamba2-2.7B"
+
+logger = logging.get_logger(__name__)
+
+
+class Zamba2RMSNormGated(torch.nn.Module):
+    def __init__(self, hidden_size, group_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+        self.group_size = group_size
+
+    def forward(self, hidden_states, gate=None):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        if gate is not None:
+            hidden_states = hidden_states * nn.functional.silu(gate.to(torch.float32))
+        *prefix_dims, last_dim = hidden_states.shape
+        group_count = last_dim // self.group_size
+        hidden_states_group = hidden_states.view(*prefix_dims, group_count, self.group_size)
+        variance = hidden_states_group.pow(2).mean(-1, keepdim=True)
+        hidden_states_group = hidden_states_group * torch.rsqrt(variance + self.variance_epsilon)
+        hidden_states = hidden_states_group.view(*prefix_dims, group_count * self.group_size)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+class Zamba2RMSNorm(ZambaRMSNorm):
+    pass
+
+
+class Zamba2HybridDynamicCache(ZambaHybridDynamicCache):
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
+    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
+    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
+    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
+    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
+    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
+    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
+    """
+
+    def __init__(
+        self, config: Zamba2Config, batch_size: int, dtype: torch.dtype = torch.float16, device: Optional[str] = None
+    ):
+        self.dtype = dtype
+        self.layers_block_type = config.layers_block_type
+        self.has_previous_state = False
+        self.intermediate_size = int(config.mamba_expand * config.hidden_size)
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.n_mamba_heads = config.n_mamba_heads
+        self.transformer_layers = []
+        self._modules = {}
+        self._parameters = {}
+        self._buffers = {}
+        self.conv_states = {}
+        self.ssm_states = {}
+        for i in range(config.num_hidden_layers):
+            self.conv_states[i] = torch.zeros(
+                batch_size,
+                self.intermediate_size + 2 * config.mamba_ngroups * config.mamba_d_state,
+                self.conv_kernel_size,
+                device=device,
+                dtype=dtype,
+            )
+            self.ssm_states[i] = torch.zeros(
+                batch_size, self.n_mamba_heads, config.mamba_headdim, self.ssm_state_size, device=device, dtype=dtype
+            )
+            if self.layers_block_type[i] == "hybrid":
+                self.transformer_layers.append(i)
+        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+
+    def update_conv_state(
+        self, layer_idx: int, new_conv_state: torch.Tensor, cache_position: torch.LongTensor
+    ) -> torch.Tensor:
+        conv_state = self.conv_states[layer_idx]
+        cache_position = cache_position.clamp(0, self.conv_kernel_size - 1)
+
+        conv_state = conv_state.roll(shifts=-1, dims=-1)
+        conv_state[:, :, cache_position] = new_conv_state.to(conv_state.device)
+        self.conv_states[layer_idx].zero_()
+        self.conv_states[layer_idx] += conv_state
+        return self.conv_states[layer_idx]
+
+    def reset(self):
+        self.conv_states.zero_()
+        self.ssm_states.zero_()
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
+        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx].numel() == 0:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+
+class Zamba2RotaryEmbedding(LlamaRotaryEmbedding):
+    pass
+
+
+class Zamba2Attention(ZambaAttention):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
+    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
+    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
+    (see fig. 2 in https://huggingface.co/papers/2405.16712).
+    Additionally, replaced
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
+    Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
+    layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
+    expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
+    """
+
+    def __init__(
+        self,
+        config: Zamba2Config,
+        layer_idx: Optional[int] = None,
+        num_fwd_mem_blocks: Optional[int] = None,
+        block_id: Optional[int] = None,
+    ):
+        super().__init__(config, layer_idx)
+        self.num_fwd_mem_blocks = num_fwd_mem_blocks
+        self.layer_block_map = config.hybrid_layer_ids
+        self.block_id = block_id
+
+        if config.use_shared_attention_adapter:
+            self.linear_q_adapter_list = nn.ModuleList([])
+            self.linear_k_adapter_list = nn.ModuleList([])
+            self.linear_v_adapter_list = nn.ModuleList([])
+
+            for i in range(self.num_fwd_mem_blocks):
+                if i % config.num_mem_blocks == block_id:
+                    linear_q_adapter = nn.Sequential(
+                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
+                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
+                    )
+                    linear_k_adapter = nn.Sequential(
+                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
+                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
+                    )
+                    linear_v_adapter = nn.Sequential(
+                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
+                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
+                    )
+                else:
+                    linear_q_adapter = nn.Identity()
+                    linear_k_adapter = nn.Identity()
+                    linear_v_adapter = nn.Identity()
+                self.linear_q_adapter_list.append(linear_q_adapter)
+                self.linear_k_adapter_list.append(linear_k_adapter)
+                self.linear_v_adapter_list.append(linear_v_adapter)
+
+        self.layer_dic = {value: index for index, value in enumerate(self.layer_block_map)}
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_idx: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+        if self.config.use_shared_attention_adapter:
+            adapter_layer_idx = self.layer_dic[layer_idx]
+            query_states = query_states + self.linear_q_adapter_list[adapter_layer_idx](hidden_states)
+            key_states = key_states + self.linear_k_adapter_list[adapter_layer_idx](hidden_states)
+            value_states = value_states + self.linear_v_adapter_list[adapter_layer_idx](hidden_states)
+
+        query_states = query_states.view(hidden_shape).transpose(1, 2)
+        key_states = key_states.view(hidden_shape).transpose(1, 2)
+        value_states = value_states.view(hidden_shape).transpose(1, 2)
+
+        if self.config.use_mem_rope:
+            cos, sin = position_embeddings
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, layer_idx)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Zamba2MambaMixer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
+    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
+    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
+    and is why Mamba is called **selective** state spaces)
+    """
+
+    def __init__(self, config: Zamba2Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.intermediate_size = int(config.mamba_expand * self.hidden_size)
+        self.layer_idx = layer_idx
+        self.use_conv_bias = config.use_conv_bias
+        self.activation = "silu"
+        self.act = nn.SiLU()
+        self.use_mem_eff_path = config.use_mem_eff_path
+
+        self.n_groups = config.mamba_ngroups
+        self.head_dim = config.mamba_headdim
+        self.num_heads = self.config.n_mamba_heads
+        self.chunk_size = config.chunk_size
+
+        self.time_step_limit = config.time_step_limit
+        self.time_step_min = config.time_step_min
+        self.time_step_max = config.time_step_max
+
+        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
+        self.conv1d = nn.Conv1d(
+            in_channels=self.conv_dim,
+            out_channels=self.conv_dim,
+            bias=True,
+            kernel_size=config.mamba_d_conv,
+            groups=self.conv_dim,
+            padding=config.mamba_d_conv - 1,
+        )
+
+        # projection of the input hidden states
+        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
+        self.in_proj = nn.Linear(
+            self.hidden_size,
+            projection_size,
+            bias=config.add_bias_linear,
+        )
+        # selective projection used to make dt, B and C input dependent
+
+        # time step projection (discretization)
+        # instantiate once and copy inv_dt in init_weights of PretrainedModel
+        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.num_heads + 1)
+        self.A_log = nn.Parameter(torch.log(A))
+        self.norm = Zamba2RMSNormGated(
+            self.intermediate_size, group_size=self.intermediate_size // self.n_groups, eps=1e-5
+        )
+        self.D = nn.Parameter(torch.ones(self.num_heads))
+
+        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.add_bias_linear)
+
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`"
+                " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d"
+            )
+
+    def cuda_kernels_forward(
+        self,
+        hidden_states: torch.Tensor,
+        cache_params: Optional[Zamba2HybridDynamicCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # set up dimensions for reshapes later
+
+        batch_size, seq_len, _ = hidden_states.shape
+        groups_time_state_size = self.n_groups * self.ssm_state_size
+        d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads
+
+        # getting projected states from cache if it exists
+        if cache_params is not None and cache_params.has_previous_state:
+            in_projected_states = self.in_proj(hidden_states.squeeze(1))  # (B 2D)
+            d_mlp = (in_projected_states.shape[-1] - d_to_remove) // 2
+            split_projection_dim = [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads]
+            _, _, gate, hidden_states_B_C, dt = torch.split(in_projected_states, split_projection_dim, dim=-1)
+
+            hidden_states_B_C = causal_conv1d_update(
+                hidden_states_B_C,
+                cache_params.conv_states[self.layer_idx],
+                self.conv1d.weight.squeeze(1),
+                self.conv1d.bias,
+                self.activation,
+            )
+
+            hidden_states, B, C = torch.split(
+                hidden_states_B_C,
+                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                dim=-1,
+            )
+            A = -torch.exp(self.A_log.float())  # (nheads,)
+
+            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D = self.D[:, None, ...].expand(-1, self.head_dim)
+            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
+            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
+            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
+            hidden_states = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states_reshaped,
+                dt,
+                A,
+                B,
+                C,
+                D,
+                z=None,
+                dt_bias=dt_bias,
+                dt_softplus=True,
+            )
+            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
+            hidden_states = self.norm(hidden_states, gate)
+            out = self.out_proj(hidden_states)[:, None, ...]
+        # if no cache is found, calling the kernel
+        else:
+            if attention_mask is not None and not torch.all(attention_mask == 1):
+                # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                dtype = hidden_states.dtype
+                hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+            # 1. Gated MLP's linear projection
+            projected_states = self.in_proj(hidden_states)
+            A = -torch.exp(self.A_log.float())  # (num_heads) or (intermediate_size, state_size)
+            dt_limit_kwargs = {} if self.time_step_limit is None else {"dt_limit": self.time_step_limit}
+            if attention_mask is not None:
+                input_not_masked = torch.all(attention_mask == 1)
+            else:
+                input_not_masked = True
+
+            if self.use_mem_eff_path and self.training and cache_params is None and input_not_masked:
+                out, ssm_state = mamba_split_conv1d_scan_combined(
+                    projected_states,
+                    self.conv1d.weight.squeeze(1),
+                    self.conv1d.bias,
+                    self.dt_bias,
+                    A,
+                    D=self.D,
+                    chunk_size=self.chunk_size,
+                    seq_idx=None,
+                    activation=self.activation,
+                    rmsnorm_weight=self.norm.weight,
+                    rmsnorm_eps=self.norm.variance_epsilon,
+                    outproj_weight=self.out_proj.weight,
+                    outproj_bias=self.out_proj.bias,
+                    headdim=self.head_dim,
+                    ngroups=self.n_groups,
+                    norm_before_gate=False,
+                    return_final_states=True,
+                    **dt_limit_kwargs,
+                )
+
+            else:
+                gate, hidden_states_B_C, time_step = torch.split(
+                    projected_states,
+                    [self.intermediate_size, self.conv_dim, self.num_heads],
+                    dim=-1,
+                )
+
+                # 1D Convolution
+                if cache_params is not None:
+                    hidden_states_B_C_t = hidden_states_B_C.transpose(1, 2)
+                    conv_state = nn.functional.pad(
+                        hidden_states_B_C_t, (self.conv_kernel_size - hidden_states_B_C_t.shape[-1], 0)
+                    )
+                    cache_params.conv_states[self.layer_idx].copy_(conv_state)
+                if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
+                    hidden_states_B_C = self.act(
+                        self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len]
+                    )  # (B, L, self.d_inner + 2 * ngroups * d_state)
+                else:
+                    hidden_states_B_C = causal_conv1d_fn(
+                        x=hidden_states_B_C.transpose(1, 2),
+                        weight=self.conv1d.weight.squeeze(1),
+                        bias=self.conv1d.bias,
+                        activation=self.activation,
+                    ).transpose(1, 2)[:, :seq_len]
+                hidden_states, B, C = torch.split(
+                    hidden_states_B_C,
+                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
+                    dim=-1,
+                )
+                if attention_mask is not None and not torch.all(attention_mask == 1):
+                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                    dtype = hidden_states.dtype
+                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+                scan_output, ssm_state = mamba_chunk_scan_combined(
+                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
+                    time_step,
+                    A,
+                    B.view(batch_size, seq_len, self.n_groups, -1),
+                    C.view(batch_size, seq_len, self.n_groups, -1),
+                    chunk_size=self.chunk_size,
+                    D=self.D,
+                    z=None,
+                    seq_idx=None,
+                    return_final_states=True,
+                    dt_bias=self.dt_bias,
+                    dt_softplus=True,
+                    **dt_limit_kwargs,
+                )
+                if ssm_state is not None and cache_params is not None:
+                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+                scan_output = scan_output.view(batch_size, seq_len, -1)
+                # Multiply "gate" branch and apply extra normalization layer
+                scan_output = self.norm(scan_output, gate)
+                out = self.out_proj(scan_output)
+        return out
+
+    # fmt: off
+    def torch_forward(self, input_states, cache_params: Optional[Zamba2HybridDynamicCache]=None, attention_mask: Optional[torch.Tensor]=None):
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+        # Gated MLP's linear projection
+        if cache_params is not None and cache_params.has_previous_state:
+            projected_states = self.in_proj(input_states.squeeze(1))
+        else:
+            if attention_mask is not None and not torch.all(attention_mask==1):
+                # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                input_states = (input_states * attention_mask[:, :, None]).to(dtype)
+            projected_states = self.in_proj(input_states)
+        d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size- self.num_heads) // 2
+        _, _, gate, hidden_states, dt = projected_states.split(
+                [d_mlp, d_mlp, self.intermediate_size,  self.conv_dim, self.num_heads], dim=-1
+        )
+
+        # Convolution sequence transformation
+        if cache_params is not None:
+            ssm_state = cache_params.ssm_states[self.layer_idx].clone()
+            ssm_state = ssm_state.to(hidden_states.device)
+            if cache_params.has_previous_state:
+                gate = gate.unsqueeze(1)
+                conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
+                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
+                # handle batched generation - states are copied through
+                conv_state[:, :, -1] = hidden_states[:, 0, :] if hidden_states.ndim == 3 else hidden_states
+                cache_params.conv_states[self.layer_idx].copy_(conv_state)
+                hidden_states = torch.sum(conv_state.to(projected_states.device) * self.conv1d.weight[:, 0, :], dim=-1)
+                if self.use_conv_bias:
+                    hidden_states += self.conv1d.bias
+                hidden_states = self.act(hidden_states).to(dtype)[:, None, ...]         # [batch, 1, intermediate_size] : decoding
+            else:
+                hidden_states = hidden_states.transpose(1,2)
+                conv_state = nn.functional.pad(
+                    hidden_states,
+                    (self.conv_kernel_size - hidden_states.shape[-1], 0)
+                )
+                cache_params.conv_states[self.layer_idx].copy_(conv_state)
+                hidden_states = self.act(self.conv1d(hidden_states).transpose(1,2))[:, :seq_len, :]     # [batch, intermediate_size, seq_len]
+                if attention_mask is not None and not torch.all(attention_mask==1):
+                    dtype = hidden_states.dtype
+                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
+                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+        else:
+            ssm_state = torch.zeros(
+                (batch_size, self.num_heads, self.head_dim, self.ssm_state_size),
+                device=hidden_states.device, dtype=dtype
+            )
+            hidden_states = self.act(self.conv1d(hidden_states.transpose(1, 2))[..., :seq_len].transpose(1, 2))
+        hidden_states, B, C = torch.split(hidden_states, [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], dim=-1)
+        A = -torch.exp(self.A_log.float())                            # [num_heads]
+        if cache_params is not None and cache_params.has_previous_state:
+            # Note: there is no need to pad parameter matrices here, as there is just one new token
+            # for batched generation
+            dt = dt[:, None, ...] if dt.ndim == 2 else dt[:, 0, :][:, None, ...]
+            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
+            # [num_heads] -> [num_heads, head_dim]
+            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)
+
+            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
+            dt = torch.clamp(dt, self.time_step_min) #, self.time_step_max)
+            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
+            # [bsz, num_heads, head_dim, state_size]
+            dA = torch.exp(dt[..., None] * A)
+
+            # Discretize B
+            # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] ->
+            # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size]
+            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
+            B = B.reshape(batch_size, -1, B.shape[-1])
+            # [bsz, num_heads, head_dim, state_size]
+            dB = dt[..., None] * B[..., None, :]
+
+            # Discretize x into dB
+            # [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
+            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
+            dBx = dB * hidden_states[..., None]
+
+            # State calculation
+            cache_params.ssm_states[self.layer_idx].copy_(
+                cache_params.ssm_states[self.layer_idx] * dA + dBx
+            )
+
+            # Subsequent output
+            # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
+            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
+            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
+            C = C.reshape(batch_size, -1, C.shape[-1])
+            # [bsz, num_heads, head_dim]
+
+            ssm_states = cache_params.ssm_states[self.layer_idx].to(C.dtype)  # Shape: [b, h, d, n]
+            # Reshape ssm_states to merge the first two dimensions
+            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)  # Shape: [b*h, d, n]
+            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)  # Shape: [b*h, n, 1]
+            y = torch.bmm(ssm_states_reshaped, C_reshaped)
+            y = y.view(batch_size, self.num_heads, self.head_dim)
+
+            # D skip connection
+            # [num_heads] -> [num_heads, head_dim]
+            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
+            y = (y + hidden_states * D).to(y.dtype)
+
+            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
+            y = y.reshape(batch_size, -1)[:, None, ...]
+        else:
+            # begin ssd naive implementation without einsums
+            dt = nn.functional.softplus(dt + self.dt_bias)
+            dt = torch.clamp(dt, self.time_step_min)
+            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
+            B = B.reshape(batch_size, seq_len,  -1, self.ssm_state_size).float()
+            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
+            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
+            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size
+
+            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)
+
+            # Discretize x and A
+            hidden_states = hidden_states * dt[..., None]
+            A = A.to(hidden_states.dtype) * dt
+
+            # Rearrange into blocks/chunks
+            hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)]
+
+
+            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
+            A = A.permute(0, 3, 1, 2)
+            A_cumsum = torch.cumsum(A, dim=-1)
+
+            # 1. Compute the output for each intra-chunk (diagonal blocks)
+            # This is the analog of a causal mask
+            L = torch.exp(segment_sum(A))
+
+            # First, contraction of C and B to get G (attention-weights like)
+            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, : ,:]  # shape: (b, c, l, s, h, n)
+            G = G_intermediate.sum(dim=-1)  # shape: (b, c, l, s, h)
+
+
+            # Step 2: Compute M, equivalent to applying attention mask to weights
+            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
+            M = M_intermediate.sum(dim=-1)
+
+            # Step 3: Compute Y_diag (apply to values)
+            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(3)
+
+            # (right term of low-rank factorization of off-diagonal blocks; B terms)
+
+            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
+            B_decay_contraction = B * decay_states.permute(0, 2, 3, 1)[..., None]
+            # permute back B * decay states
+            states = (B_decay_contraction.permute(0, 1, 3, 2, 4)[..., None]  * hidden_states.permute(0, 1, 3, 2, 4)[..., None, :]).sum(dim=3).permute(0, 1, 2, 4, 3)
+            if cache_params is not None and cache_params.has_previous_state:
+                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...]
+            else:
+                previous_states = torch.zeros_like(states[:, :1])
+            states = torch.cat([previous_states, states], dim=1)
+            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))
+
+            states_permuted = states.permute(0, 2, 1, 3, 4)
+            result = (decay_chunk[..., None, None] * states_permuted[:, :, None, ...]).sum(dim=2)
+            new_states = result.permute(0, 2, 1, 3, 4)
+            states, ssm_state = new_states[:, :-1], new_states[:, -1]
+
+            # Compute state -> output conversion per chunk
+            # (left term of low-rank factorization of off-diagonal blocks; C terms)
+            state_decay_out = torch.exp(A_cumsum)
+            # compute Yoff
+            C_times_states = (C[..., None, :] * states[:, :, None, ...])
+            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
+            Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None])
+            # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
+
+            y = Y_diag + Y_off
+            # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
+            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)
+
+            y = y + D_residual
+            # Cutting off padded chunks
+            if pad_size > 0:
+                y = y[:, :seq_len, :, :]
+            y = y.reshape(batch_size, seq_len, -1)
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        scan_output = self.norm(y, gate)
+
+        # end ssd naive
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
+        return contextualized_states
+    # fmt: on
+
+    def forward(
+        self,
+        hidden_states,
+        cache_params: Optional[Zamba2HybridDynamicCache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
+            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask)
+
+        return self.torch_forward(hidden_states, cache_params, attention_mask)
+
+
+class Zamba2MLP(nn.Module):
+    def __init__(self, config: Zamba2Config, num_fwd_mem_blocks=None, block_id: Optional[int] = None):
+        """
+        This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
+        is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
+        """
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.num_fwd_mem_blocks = num_fwd_mem_blocks
+        self.block_id = block_id
+
+        self.gate_up_proj = nn.Linear(self.hidden_size, 2 * self.intermediate_size, bias=config.add_bias_linear)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.add_bias_linear)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+        self.gate_up_proj_adapter_list = nn.ModuleList([])
+        for i in range(self.num_fwd_mem_blocks):
+            if i % config.num_mem_blocks == block_id:
+                gate_up_proj_adapter = nn.Sequential(
+                    nn.Linear(self.config.hidden_size, self.config.adapter_rank, bias=False),
+                    nn.Linear(self.config.adapter_rank, 2 * self.intermediate_size, bias=False),
+                )
+            else:
+                gate_up_proj_adapter = nn.Identity()
+            self.gate_up_proj_adapter_list.append(gate_up_proj_adapter)
+
+        layer_block_map = config.hybrid_layer_ids
+        self.layer_dic = {value: index for index, value in enumerate(layer_block_map)}
+
+    def forward(self, hidden_state, layer_idx=None):
+        gate_up_state = self.gate_up_proj(hidden_state)
+        layer_idx = self.layer_dic[layer_idx]
+        gate_up_state = gate_up_state + self.gate_up_proj_adapter_list[layer_idx](hidden_state)
+
+        gate_up_state = torch.chunk(gate_up_state, 2, dim=-1)
+        hidden_state = self.act_fn(gate_up_state[0]) * gate_up_state[1]
+        output = self.down_proj(hidden_state)
+        return output
+
+
+class Zamba2AttentionDecoderLayer(ZambaAttentionDecoderLayer):
+    def __init__(self, config: Zamba2Config, block_id: Optional[int] = None, layer_idx: Optional[int] = None):
+        self.block_id = block_id
+        num_gs = len(config.hybrid_layer_ids)
+        super().__init__(config, layer_idx)
+        self.self_attn = Zamba2Attention(config, layer_idx=-1, num_fwd_mem_blocks=num_gs, block_id=block_id)
+        self.feed_forward = Zamba2MLP(config, num_fwd_mem_blocks=num_gs, block_id=block_id)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: torch.Tensor,
+        layer_idx: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        position_embeddings: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
+            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
+                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
+                concatenated tensor is then used as input of the pre-attention RMSNorm
+                (see fig. 2 in https://huggingface.co/papers/2405.16712).
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+        """
+        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            layer_idx=layer_idx,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states, layer_idx)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+class Zamba2MambaDecoderLayer(ZambaMambaDecoderLayer):  # Mamba decoder layer that installs the Zamba2 mixer and RMSNorm
+    def __init__(self, config: Zamba2Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.mamba = Zamba2MambaMixer(config=config, layer_idx=layer_idx)  # set after super().__init__; presumably replaces the parent's mixer - confirm in ZambaMambaDecoderLayer
+        self.input_layernorm = Zamba2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)  # RMSNorm sized by config.hidden_size, eps from config.rms_norm_eps
+
+
+class Zamba2HybridLayer(ZambaHybridLayer):  # runs a shared attention block, a per-layer linear, then a Mamba decoder layer
+    def __init__(
+        self, shared_transformer: Zamba2AttentionDecoderLayer, linear: nn.Linear, mamba: Zamba2MambaDecoderLayer
+    ):
+        super().__init__(shared_transformer, linear, mamba)
+        del self.shared_transf  # presumably created by ZambaHybridLayer.__init__ (not visible here) - confirm; dropped so only the name below is kept
+        self.shared_transformer = shared_transformer  # forward() below calls the block through this attribute
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: Optional[torch.Tensor] = None,
+        layer_idx: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,  # forwarded unchanged to the Mamba decoder
+        causal_mask: Optional[torch.Tensor] = None,  # forwarded to the shared transformer as its `attention_mask`
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        position_embeddings: Optional[torch.LongTensor] = None,  # NOTE(review): docstring describes a (cos, sin) tuple; annotation looks stale - confirm
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
+            hidden activations to form the input of the shared transformer layer.
+            layer_idx (`int`): layer number.
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
+                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
+                with `head_dim` being the embedding dimension of each attention head.
+        """
+        # Step 1: shared attention block; `use_cache` is not passed to it in this call.
+        layer_outputs = self.shared_transformer(
+            hidden_states,
+            original_hidden_states=original_hidden_states,
+            layer_idx=layer_idx,
+            attention_mask=causal_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            position_embeddings=position_embeddings,
+        )
+
+        transformer_hidden_states = layer_outputs[0]
+
+        if output_attentions:
+            self_attn_weights = layer_outputs[1]  # only bound when output_attentions is truthy; read again under the same condition below
+        # Step 2: project the attention output with this layer's linear module.
+        transformer_hidden_states = self.linear(transformer_hidden_states)
+        # Step 3: the Mamba decoder receives the incoming `hidden_states` plus the projected attention output.
+        layer_outputs = self.mamba_decoder(
+            hidden_states,
+            transformer_hidden_states=transformer_hidden_states,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            position_embeddings=position_embeddings,
+        )
+        # Step 4: put the attention weights into slot 1 of the Mamba decoder's output tuple.
+        if output_attentions:
+            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]
+
+        return layer_outputs
+
+
+class Zamba2PreTrainedModel(PreTrainedModel):  # shared base: class-level flags plus Mamba-mixer parameter initialisation
+    config: Zamba2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Zamba2AttentionDecoderLayer", "Zamba2MambaDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_sdpa = True
+    # Note: only supports Zamba2HybridDynamicCache
+    _is_stateful = True
+
+    def _init_weights(self, module):  # parent initialiser first, then extra handling for Zamba2MambaMixer modules only
+        super()._init_weights(module)
+        if isinstance(module, Zamba2MambaMixer):
+            dt = torch.exp(  # n_mamba_heads values, log-uniform between time_step_min and time_step_max
+                torch.rand(self.config.n_mamba_heads)
+                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+                + math.log(self.config.time_step_min)
+            ).clamp(min=self.config.time_step_floor)  # values below time_step_floor are raised to it
+            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+            inv_dt = dt + torch.log(-torch.expm1(-dt))  # dt + log(1 - exp(-dt)), i.e. log(exp(dt) - 1)
+            module.dt_bias.data.copy_(inv_dt)
+            # A_log receives log(1), log(2), ..., log(num_heads); D is filled with ones.
+            A = torch.arange(1, module.num_heads + 1)
+            module.A_log.data.copy_(torch.log(A))
+            module.D.data.fill_(1.0)
+
+
+class Zamba2Model(ZambaModel, Zamba2PreTrainedModel):
+    """
+    Model consisting of *config.num_hidden_layers* layers.
+
+    Args:
+        config: Zamba2Config
+    """
+
+    def __init__(self, config: Zamba2Config):
+        Zamba2PreTrainedModel.__init__(self, config)  # explicit base call instead of super().__init__(); ZambaModel.__init__ is not invoked by this line
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        blocks = [Zamba2AttentionDecoderLayer(config, block_id=k) for k in range(config.num_mem_blocks)]  # attention blocks that hybrid layers share
+        mamba_layers = []
+        linear_layers = []
+        self.layers_block_type = config.layers_block_type
+        for i in range(config.num_hidden_layers):  # both layer types get a Mamba decoder; "hybrid" also gets a bias-free linear
+            if config.layers_block_type[i] == "mamba":
+                mamba_layers.append(Zamba2MambaDecoderLayer(config, layer_idx=i))
+            elif config.layers_block_type[i] == "hybrid":
+                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
+                mamba_layers.append(Zamba2MambaDecoderLayer(config, layer_idx=i))
+        mamba_layers = iter(mamba_layers)  # turned into iterators so get_layers can consume them with next()
+        linear_layers = iter(linear_layers)
+        blocks = cycle(blocks)  # `cycle` is presumably itertools.cycle (import not visible here): blocks are handed out round-robin
+        layers = self.get_layers(blocks, linear_layers, mamba_layers)
+        self.layers = nn.ModuleList(layers)
+
+        self._attn_implementation = config._attn_implementation
+        self.final_layernorm = Zamba2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        if config.use_mem_rope:  # `rotary_emb` only exists when use_mem_rope is set; forward() guards on the same flag
+            if config.use_long_context:
+                logger.warning_once(
+                    "`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`."
+                )
+            self.rotary_emb = Zamba2RotaryEmbedding(config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_layers(self, blocks, linear_layers, mamba_layers):  # builds the layer list and fills self._tied_weights_keys as a side effect
+        layers = []
+        self._tied_weights_keys = []
+        self.first_transformer_layer_id = 0
+        for layer_id, layer_type in enumerate(self.layers_block_type):
+            if layer_type == "hybrid":
+                if self.first_transformer_layer_id == 0:  # NOTE(review): 0 doubles as "unset"; if layer 0 is hybrid, a later hybrid layer overwrites it - confirm intended
+                    self.first_transformer_layer_id = layer_id
+                block = next(blocks)
+                if self.config.num_mem_blocks * len(self.config.hybrid_layer_ids) > 1:  # tied-key patterns are registered only when this product exceeds 1
+                    prefix_pattern = rf"^layers\.{layer_id}\.shared_transformer\."
+                    main_keys_pattern = re.compile(
+                        prefix_pattern
+                        + r"(?:"
+                        + r"self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|"
+                        + r"feed_forward\.(?:gate_up_proj|down_proj)\.weight|"
+                        + r"(?:input_layernorm|pre_ff_layernorm)\.weight"
+                        + r")$"
+                    )
+                    self._tied_weights_keys.append(main_keys_pattern)
+                    # One adapter pattern per hybrid entry whose running index maps (mod num_mem_blocks) to this block's id.
+                    adapter_id = 0
+                    for _layer_type in self.layers_block_type:
+                        if _layer_type == "hybrid" and adapter_id % self.config.num_mem_blocks == block.block_id:
+                            adapter_pattern = re.compile(
+                                r"^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\."
+                                + str(adapter_id)
+                                + r"\.(?:0|1)\.weight$"
+                            )
+                            self._tied_weights_keys.append(adapter_pattern)
+                        adapter_id += 1  # advances for every entry of layers_block_type, hybrid or not
+                    if self.config.use_shared_attention_adapter:  # same scan again, this time for the q/k/v attention adapters
+                        adapter_id = 0
+                        for _layer_type in self.layers_block_type:
+                            if _layer_type == "hybrid" and adapter_id % self.config.num_mem_blocks == block.block_id:
+                                attn_adapter_pattern = re.compile(
+                                    r"^shared_transformer\.self_attn\."
+                                    + r"(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\."
+                                    + str(adapter_id)
+                                    + r"\.(?:0|1)\.weight$"
+                                )
+                                self._tied_weights_keys.append(attn_adapter_pattern)
+                            adapter_id += 1
+                layers.append(Zamba2HybridLayer(block, next(linear_layers), next(mamba_layers)))
+            else:
+                layers.append(next(mamba_layers))  # any non-"hybrid" entry becomes a plain Mamba decoder layer
+        return layers
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions  # None falls back to the config value
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):  # raises unless exactly one of the two inputs is given
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        hidden_states = inputs_embeds
+
+        original_hidden_states = torch.clone(inputs_embeds)
+        # original_hidden_states: word embedding output that will be concatenated with hidden activations to form the input of the shared transformer layer
+
+        if use_cache and past_key_values is None:  # a fresh cache is created only when caching is requested and none was supplied
+            batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
+            past_key_values = Zamba2HybridDynamicCache(self.config, batch_size, dtype=self.dtype, device=self.device)
+
+        if cache_position is None:
+            past_seen_tokens = (
+                past_key_values.get_seq_length(layer_idx=self.first_transformer_layer_id)  # length is read at the layer id recorded by get_layers
+                if past_key_values is not None
+                else 0
+            )
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)  # default positions come from cache_position with a leading dim added
+
+        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)  # helper is not defined in this class body; presumably inherited
+
+        # create position embeddings to be shared across the decoder layers
+        if self.config.use_mem_rope:
+            position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        else:
+            position_embeddings = None
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for layer_idx, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)  # records the input to each layer; the final normed output is appended after the loop
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer.__call__,  # arguments below are positional, in the same order as Zamba2HybridLayer.forward's parameters
+                    hidden_states,
+                    original_hidden_states,
+                    layer_idx,
+                    attention_mask,
+                    causal_mask,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    position_embeddings,
+                )
+            else:
+                layer_outputs = layer(
+                    hidden_states,
+                    original_hidden_states=original_hidden_states,
+                    layer_idx=layer_idx,
+                    attention_mask=attention_mask,
+                    causal_mask=causal_mask,
+                    past_key_values=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    position_embeddings=position_embeddings,
+                )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if past_key_values is not None and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True  # flag is flipped on the cache object itself after the first pass that used it
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,  # cache is returned only when use_cache is truthy
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+        return output if return_dict else output.to_tuple()
+
+
+class Zamba2ForCausalLM(ZambaForCausalLM):  # Zamba2-named subclass; adds nothing to ZambaForCausalLM in this file
+    pass
+
+
+class Zamba2ForSequenceClassification(ZambaForSequenceClassification):  # Zamba2-named subclass; adds nothing to the parent in this file
+    pass
+
+
+__all__ = [  # public names exported by this module; the layer and mixer classes above are not listed
+    "Zamba2ForCausalLM",
+    "Zamba2ForSequenceClassification",
+    "Zamba2Model",
+    "Zamba2PreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..186ecf6a3665d5b95937fef96228cbd29913fd7d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/configuration_zoedepth.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/configuration_zoedepth.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a617b9bc36a61a480a24a5aa751b7c728ac1584c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/configuration_zoedepth.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/image_processing_zoedepth.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/image_processing_zoedepth.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..384b7cbd0b11c0400c6d69c01c0c6cb351ea44c4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/image_processing_zoedepth.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/image_processing_zoedepth_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/image_processing_zoedepth_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61153779365097d09d2a5ef0a92e1588d316568a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/image_processing_zoedepth_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/modeling_zoedepth.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/modeling_zoedepth.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..936b4d35e5195a95fbf3e1240b4be3e2c0daccc0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zoedepth/__pycache__/modeling_zoedepth.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aac65eec0d46241f130ee362ec3c1578cfafd6d7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f7b20f1b3be9e1aa7e1eb67e88d1bfe6d97a352
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d58dfe1c921a396ac1df601963ac11acd52f54b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c2a22aa9ef180d75d80624432a4006b49ca6c23
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/base.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/base.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3b7cb1a4730f90abd4ccf02839d4f2b8e2592cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/base.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86642058e270839460d014bb4bc6e94c2300f3f5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d757ae3f3c2aaade86b740d023bd71183358144
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..313725c40c212706c51e01b41c2889b92d8288b8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c3f1894c1797333828bd839845d6061862753a4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48eb352b2a8008337ba1b2d91d33b1dc1ed8f4f8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5c99e993543a9bdf239a9f59c6b5e0402dc11e0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..096d027b3a1f46191415d0a13d9ce39829193969
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24a82d99390ff618a64cfdc183880470373bbc08
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51f9c359e6924add8d39d46ecf81f086a5d9e6cc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3c4ad14bd954abd0e6734a973e5fb705b0db67b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/keypoint_matching.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/keypoint_matching.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f19a095089fe620f1c3b106ba270e901b8087faa
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/keypoint_matching.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fac8ef3a241419c19b16c937020aae4ccacd7d9d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50a1c0ca4afc3dc901a56130581dc73845392188
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9de8f82a5b8002c1f5000743258163bc25cdeb98
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebe33e6d217b7588db3e4d61c76bc77e0ec65ff0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..802bb302ecd173b3f36cbd52b044b5972bf35597
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74046e8746fad07ae478ae7882c85b18a9cb59f8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4387fb56a7adcf5ae95e425653b5dbd0f1fd2b87
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed59de1b7e4c126bc73ab968dd3015ff69db4249
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6582e23f2eb4165396a753588f0e0a964422324
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69292b9972b591f3d9cbf7ebb312c7bba65120b8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd9bdc591ef18cc510f582762224ff088e5346d1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..966a7f5edb8588e2840340595d18505f411ac63d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4c16c3ed501e28ebabc1daf55c81f0c4dd73097
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79a6091f2d7e3b68651ef3fe3387bb4e1f7af1f8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30e9c87de67ae7758cf1ec41ff81aae23ea4f77d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4e0dc2a4114b151443ea5774fda048fc5a2d0a5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2050ece8a9ef014e0d9e1e3a2914998590ac865e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6fce28c771bd3423878148cee34cec1c8d022cce
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/compiler.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65c95826f53a5e7643dd1264f29b509f8a8be732
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__pycache__/driver.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/channel_descriptor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/channel_descriptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4fba89435ec69efeddaaaacfe2b6e2f4144dd34
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/channel_descriptor.h
@@ -0,0 +1,597 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CHANNEL_DESCRIPTOR_H__)
+#define __CHANNEL_DESCRIPTOR_H__
+
+#if defined(__cplusplus)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ *
+ * @{
+ */
+
+/**
+ * \brief \hl Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
+ * ::cudaChannelFormatKindSignedNormalized8X4,
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
+ * ::cudaChannelFormatKindSignedNormalized16X4,
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
+ * ::cudaChannelFormatKindUnsignedNormalized16X4, 
+ * ::cudaChannelFormatKindUnsignedNormalized1010102
+ * or ::cudaChannelFormatKindNV12.
+ *
+ * The format is specified by the template specialization.
+ *
+ * The template function specializes for the following scalar types:
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
+ * The template function specializes for the following vector types:
+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
+ * The template function specializes for following cudaChannelFormatKind enum values:
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, 
+ * ::cudaChannelFormatKindUnsignedNormalized1010102
+ * and ::cudaChannelFormatKindNV12.
+ *
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
+ * ::cudaGetChannelDesc, 
+ */
+template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{
+  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
+{
+  int e = (int)sizeof(char) * 8;
+
+#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+#if !defined(__LP64__)
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+#endif /* !__LP64__ */
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
+{
+    int e = (int)sizeof(char) * 8;
+
+    return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
+}
+
+template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{
+    return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+/* Signed 8-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
+}
+
+/* Unsigned 8-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
+}
+
+/* Signed 16-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
+{
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
+}
+
+/* Unsigned 16-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
+{
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
+}
+
+/* NV12 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
+}
+
+/* Int101010 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized1010102>(void)
+{
+    return cudaCreateChannelDesc(10, 10, 10, 2, cudaChannelFormatKindUnsignedNormalized1010102);
+}
+
+/* BC1 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
+}
+
+/* BC1sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
+}
+
+/* BC2 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
+}
+
+/* BC2sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
+}
+
+/* BC3 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
+}
+
+/* BC3sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
+}
+
+/* BC4 unsigned format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
+}
+
+/* BC4 signed format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
+}
+
+/* BC5 unsigned format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
+}
+
+/* BC5 signed format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
+}
+
+/* BC6H unsigned format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
+}
+
+/* BC6H signed format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
+}
+
+/* BC7 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
+}
+
+/* BC7sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
+}
+
+#endif /* __cplusplus */
+
+/** @} */
+/** @} */ /* END CUDART_TEXTURE_HL */
+
+#endif /* !__CHANNEL_DESCRIPTOR_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups.h
new file mode 100644
index 0000000000000000000000000000000000000000..0532a97bbaba37b6aa8540426d9d89adef6f4612
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cooperative_groups.h
@@ -0,0 +1,1743 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _COOPERATIVE_GROUPS_H_
+#define _COOPERATIVE_GROUPS_H_
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "cooperative_groups/details/info.h"
+#include "cooperative_groups/details/driver_abi.h"
+#include "cooperative_groups/details/helpers.h"
+#include "cooperative_groups/details/memory.h"
+
+#if defined(_CG_HAS_STL_ATOMICS)
+#include <cuda/atomic>
+#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope; /* declares a `thread_scope` constant in the class that invokes the macro */
+#else /* _CG_HAS_STL_ATOMICS is not defined: the macro expands to nothing */
+#define _CG_THREAD_SCOPE(scope)
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details { // numeric tags passed as the TyId argument of thread_group_base<> and stored in thread_group's type field
+    _CG_CONST_DECL unsigned int coalesced_group_id = 1; // coalesced_group, and the tiles built by thread_block::_get_tiled_threads
+    _CG_CONST_DECL unsigned int multi_grid_group_id = 2; // multi_grid_group (experimental-ABI variant)
+    _CG_CONST_DECL unsigned int grid_group_id = 3; // grid_group
+    _CG_CONST_DECL unsigned int thread_block_id = 4; // thread_block
+    _CG_CONST_DECL unsigned int multi_tile_group_id = 5; // NOTE(review): no user of this tag is visible in this section of the header
+    _CG_CONST_DECL unsigned int cluster_group_id = 6; // cluster_group
+}
+
+/**
+ * class thread_group;
+ *
+ * Generic thread group type, into which all groups are convertible.
+ * It acts as a container for all storage necessary for the derived groups,
+ * and will dispatch the API calls to the correct derived group. This means
+ * that all derived groups must implement the same interface as thread_group.
+ */
+class thread_group // type-erased group: a tagged union of per-kind payloads plus the common query interface
+{
+protected:
+    struct group_data { // smallest view of the storage: a 1-bit flag followed by the 7-bit group type tag
+        unsigned int _unused : 1;
+        unsigned int type : 7, : 0; // the unnamed zero-width field closes the current bit-field unit
+    };
+
+    struct gg_data  { // grid_group payload: only the grid workspace pointer
+        details::grid_workspace *gridWs; // NOTE(review): shares its bytes with group.type through the union below
+    };
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    struct mg_data  { // multi_grid_group payload, only present with the experimental ABI
+        unsigned long long _unused : 1;
+        unsigned long long type    : 7;
+        unsigned long long handle  : 56; // written from functions->get_intrinsic_handle(); multi_grid_group treats 0 as invalid
+        const details::multi_grid::multi_grid_functions *functions;
+    };
+#endif
+
+    struct tg_data { // payload for warp-level groups (coalesced_group and runtime-sized tiles)
+        unsigned int is_tiled : 1; // set by the _get_tiled_threads() paths that build contiguous tiles; cleared by coalesced_group(mask)
+        unsigned int type : 7;
+        unsigned int size : 24; // number of member lanes, written as __popc(mask)
+        // packed to 4b
+        unsigned int metaGroupSize : 16; // number of tiles produced by the partition that created this group
+        unsigned int metaGroupRank : 16; // index of this tile among them
+        // packed to 8b
+        unsigned int mask; // lane mask of the member threads
+        // packed to 12b
+        unsigned int _res;
+    };
+
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend class thread_block;
+
+    union __align__(8) { // one storage area, interpreted according to the concrete group kind
+        group_data  group;
+        tg_data     coalesced;
+        gg_data     grid;
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+        mg_data     multi_grid;
+#endif
+    } _data;
+
+    _CG_QUALIFIER thread_group operator=(const thread_group& src); // declared only; no definition appears inside this class
+
+    _CG_QUALIFIER thread_group(unsigned int type) { // records the group kind; used by thread_group_base<TyId>
+        _data.group.type = type;
+        _data.group._unused = false;
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    static_assert(sizeof(tg_data) <= 16, "Failed size check"); // each payload is required to fit in 16 bytes
+    static_assert(sizeof(gg_data) <= 16, "Failed size check");
+#  ifdef _CG_ABI_EXPERIMENTAL
+    static_assert(sizeof(mg_data) <= 16, "Failed size check");
+#  endif
+#endif
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    _CG_QUALIFIER unsigned long long size() const; // this and the next three members are only declared here
+    _CG_QUALIFIER unsigned long long num_threads() const;
+    _CG_QUALIFIER unsigned long long thread_rank() const;
+    _CG_QUALIFIER void sync() const;
+    _CG_QUALIFIER unsigned int get_type() const { // reads back the 7-bit type field of the union
+        return _data.group.type;
+    }
+
+};
+
+template <unsigned int TyId>
+struct thread_group_base : public thread_group {
+    _CG_STATIC_CONST_DECL unsigned int id = TyId;  // compile-time copy of the group kind tag
+    _CG_QUALIFIER thread_group_base() : thread_group(TyId) {}  // stores the same tag in the thread_group base
+};
+
+#if defined(_CG_HAS_MULTI_GRID_GROUP)
+
+/**
+ * class multi_grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
+ * same system, on multiple devices within the same launched kernels.
+ * To use this group, the kernel must have been launched with
+ * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_multi_grid();
+ */
+
+
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+class multi_grid_group;
+
+// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
+template <typename = void>
+__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
+
+class multi_grid_group : public thread_group_base<details::multi_grid_group_id> // experimental-ABI variant: every query goes through a function table
+{
+private:
+    template <typename = void>
+    _CG_QUALIFIER multi_grid_group() { // private: constructed by the befriended this_multi_grid<void>()
+        _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
+        _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle(); // relies on `functions` being set by the previous line
+    }
+
+    friend multi_grid_group this_multi_grid<void>();
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    _CG_QUALIFIER bool is_valid() const { // a zero handle marks an unusable group
+        return (_data.multi_grid.handle != 0);
+    }
+
+    _CG_QUALIFIER void sync() const { // forwards to functions->sync(); an invalid group hits _CG_ABORT() first
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        _data.multi_grid.functions->sync(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned long long num_threads() const { // validity is only checked with _CG_ASSERT here and below
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->size(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned long long size() const { // same value as num_threads()
+        return num_threads();
+    }
+
+    _CG_QUALIFIER unsigned long long thread_rank() const { // forwards to functions->thread_rank()
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned int grid_rank() const { // forwards to functions->grid_rank()
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
+    }
+
+    _CG_QUALIFIER unsigned int num_grids() const { // forwards to functions->num_grids()
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
+    }
+};
+# else
+class multi_grid_group // default-ABI variant: standalone class (no thread_group base) that caches size and rank
+{
+private:
+    unsigned long long _handle; // from details::multi_grid::get_intrinsic_handle(); 0 is treated as invalid
+    unsigned int _size; // cached at construction from details::multi_grid::size()
+    unsigned int _rank; // cached at construction from details::multi_grid::thread_rank()
+
+    friend _CG_QUALIFIER multi_grid_group this_multi_grid();
+
+    _CG_QUALIFIER multi_grid_group() { // private: constructed by the befriended this_multi_grid()
+        _handle = details::multi_grid::get_intrinsic_handle();
+        _size = details::multi_grid::size(_handle); // both queries below are issued without checking the handle first
+        _rank = details::multi_grid::thread_rank(_handle);
+    }
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const { // a zero handle marks an unusable group
+        return (_handle != 0);
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED void sync() const { // forwards to details::multi_grid::sync(); an invalid group hits _CG_ABORT() first
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::multi_grid::sync(_handle);
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const { // returns the cached size
+        _CG_ASSERT(is_valid());
+        return _size;
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const { // same value as num_threads()
+        return num_threads();
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const { // returns the cached rank
+        _CG_ASSERT(is_valid());
+        return _rank;
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const { // queried on every call, not cached
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::grid_rank(_handle));
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const { // queried on every call, not cached
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::num_grids(_handle));
+    }
+};
+# endif
+
+/**
+ * multi_grid_group this_multi_grid()
+ *
+ * Constructs a multi_grid_group
+ */
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+template <typename> // experimental ABI: defines the function template declared ahead of the class
+__device__
+#else
+_CG_QUALIFIER // default ABI: plain function, befriended by the non-experimental multi_grid_group
+# endif
+_CG_DEPRECATED
+multi_grid_group this_multi_grid()
+{
+    return multi_grid_group(); // uses the private default constructor of whichever class variant was compiled
+}
+#endif
+
+/**
+ * class grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
+ * same device within the same launched kernel. To use this group, the kernel
+ * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_grid();
+ */
+class grid_group : public thread_group_base<details::grid_group_id>
+{
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
+    friend _CG_QUALIFIER grid_group this_grid();
+
+private:
+    _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) { // private: constructed by the befriended this_grid()
+        _data.grid.gridWs = gridWs;
+    }
+
+ public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    _CG_QUALIFIER bool is_valid() const { // a NULL workspace pointer marks an unusable group
+        return (_data.grid.gridWs != NULL);
+    }
+
+    _CG_QUALIFIER void sync() const { // forwards the workspace barrier to details::grid::sync(); an invalid group hits _CG_ABORT() first
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::grid::sync(&_data.grid.gridWs->barrier);
+    }
+
+#if defined(_CG_CPP11_FEATURES)
+    using arrival_token = unsigned int;
+
+    _CG_QUALIFIER arrival_token barrier_arrive() const { // returns the token produced by details::grid::barrier_arrive()
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        return details::grid::barrier_arrive(&_data.grid.gridWs->barrier);
+    }
+
+    _CG_QUALIFIER void barrier_wait(arrival_token&& token) const { // NOTE(review): unlike sync() and barrier_arrive(), no is_valid() check is made here
+        details::grid::barrier_wait(token, &_data.grid.gridWs->barrier);
+    }
+#endif
+
+    // The static members below read no per-object state; each forwards to the details::grid helper of the same name
+    _CG_STATIC_QUALIFIER unsigned long long size() {
+        return details::grid::size();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() { // forwards to details::grid::grid_dim(), the one helper whose name differs
+        return details::grid::grid_dim();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads() {
+        return details::grid::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_threads() {
+        return details::grid::num_threads();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index() {
+        return details::grid::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
+        return details::grid::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks() {
+        return details::grid::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
+        return details::grid::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 block_index() {
+        return details::grid::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long block_rank() {
+        return details::grid::block_rank();
+    }
+
+# if defined(_CG_HAS_CLUSTER_GROUP)
+    _CG_STATIC_QUALIFIER dim3 dim_clusters() { // the cluster queries exist only when _CG_HAS_CLUSTER_GROUP is defined
+        return details::grid::dim_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
+        return details::grid::num_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 cluster_index() {
+        return details::grid::cluster_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
+        return details::grid::cluster_rank();
+    }
+# endif
+};
+
+_CG_QUALIFIER grid_group this_grid() {
+    // Bind the new group to the workspace returned by details::get_grid_workspace()
+    grid_group grid_handle(details::get_grid_workspace());
+#if defined(_CG_DEBUG)
+    // Debug builds synchronize right away, so *all* threads must be able to take part
+    grid_handle.sync();
+#endif /* _CG_DEBUG */
+    return grid_handle;
+}
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+/**
+ * class cluster_group
+ *
+ * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
+ * divided along all dimensions to form groups of blocks, each group of which is
+ * a block cluster. Clustered grids are subject to various restrictions and
+ * limitations. Primarily, a cluster consists of at most 8 blocks by default
+ * (although the user is allowed to opt-in to non-standard sizes,) and clustered
+ * grids are subject to additional occupancy limitations due to per-cluster
+ * hardware resource consumption. In exchange, a block cluster is guaranteed to
+ * be a cooperative group, with access to all cooperative group capabilities, as
+ * well as cluster specific capabilities and accelerations. A cluster_group
+ * represents a block cluster.
+ *
+ * Constructed via this_cluster();
+ */
+class cluster_group : public thread_group_base<details::cluster_group_id>
+{
+    // Friends
+    friend _CG_QUALIFIER cluster_group this_cluster();
+
+    // Private default constructor: only the befriended this_cluster() can create a cluster_group
+    _CG_QUALIFIER cluster_group()
+    {
+    }
+
+ public:
+    //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
+
+    using arrival_token = struct {}; // empty token: the cluster barrier calls below take no argument
+
+    // Functionality exposed by the group
+    _CG_STATIC_QUALIFIER void sync() // forwards to details::cluster::sync()
+    {
+        return details::cluster::sync();
+    }
+
+    _CG_STATIC_QUALIFIER arrival_token barrier_arrive() // signals arrival, then hands back an empty token
+    {
+        details::cluster::barrier_arrive();
+        return arrival_token();
+    }
+
+    _CG_STATIC_QUALIFIER void barrier_wait() // token-less overload
+    {
+        return details::cluster::barrier_wait();
+    }
+
+    _CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&) // the token is ignored; same call as the overload above
+    {
+        return details::cluster::barrier_wait();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) // forwards to details::cluster::query_shared_rank()
+    {
+        return details::cluster::query_shared_rank(addr);
+    }
+
+    template <typename T>
+    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) // forwards to details::cluster::map_shared_rank()
+    {
+        return details::cluster::map_shared_rank(addr, rank);
+    }
+
+    // The remaining queries are static and forward to the details::cluster helper of the same name
+    _CG_STATIC_QUALIFIER dim3 block_index()
+    {
+        return details::cluster::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int block_rank()
+    {
+        return details::cluster::block_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index()
+    {
+        return details::cluster::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank()
+    {
+        return details::cluster::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks()
+    {
+        return details::cluster::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_blocks()
+    {
+        return details::cluster::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads()
+    {
+        return details::cluster::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads()
+    {
+        return details::cluster::num_threads();
+    }
+
+    // Legacy aliases
+    _CG_STATIC_QUALIFIER unsigned int size() // same value as num_threads()
+    {
+        return num_threads();
+    }
+};
+
+/*
+ * cluster_group this_cluster()
+ *
+ * Constructs a cluster_group
+ */
+_CG_QUALIFIER cluster_group this_cluster()
+{
+    cluster_group cluster_handle;  // private default constructor; this function is its friend
+#if defined(_CG_DEBUG)
+    cluster_handle.sync();  // debug builds synchronize the cluster immediately
+#endif
+    return cluster_handle;
+}
+#endif
+
+#if defined(_CG_CPP11_FEATURES)
+class thread_block;
+template <unsigned int MaxBlockSize>
+_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
+#endif
+
+/**
+ * class thread_block
+ *
+ * Every GPU kernel is executed by a grid of thread blocks, and threads within
+ * each block are guaranteed to reside on the same streaming multiprocessor.
+ * A thread_block represents a thread block whose dimensions are not known until runtime.
+ *
+ * Constructed via this_thread_block();
+ */
+class thread_block : public thread_group_base<details::thread_block_id>
+{
+    // Friends
+    friend _CG_QUALIFIER thread_block this_thread_block();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
+
+#if defined(_CG_CPP11_FEATURES)
+    template <unsigned int MaxBlockSize>
+    friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
+    template <unsigned int Size>
+    friend class __static_size_multi_warp_tile_base;
+
+    details::multi_warp_scratch* const tile_memory; // fixed at construction from details::get_scratch_ptr()
+
+    template <unsigned int MaxBlockSize>
+    _CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) : // variant that takes caller-provided scratch storage
+        tile_memory(details::get_scratch_ptr(&scratch)) {
+#ifdef _CG_DEBUG
+        if (num_threads() > MaxBlockSize) { // debug-only check that the block does not exceed the declared maximum
+            details::abort();
+        }
+#endif
+
+
+#if defined(_CG_USER_PROVIDED_SHARED_MEMORY)
+#define _CG_SKIP_BARRIER_INIT_TARGET NV_NO_TARGET
+#else
+#define _CG_SKIP_BARRIER_INIT_TARGET NV_PROVIDES_SM_80
+#endif
+        NV_IF_ELSE_TARGET(
+            _CG_SKIP_BARRIER_INIT_TARGET,
+            // skip if clause
+        ,
+            (tile_memory->init_barriers(thread_rank());
+            sync();)
+        )
+    }
+#endif
+#undef _CG_SKIP_BARRIER_INIT_TARGET
+
+    // Private default constructor: reachable only through the friends declared above
+    _CG_QUALIFIER thread_block()
+#if defined(_CG_CPP11_FEATURES)
+    : tile_memory(details::get_scratch_ptr(NULL))
+#endif
+    { }
+
+    // Internal Use
+    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const { // builds the runtime-sized tile that contains the calling thread
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (thread_block()); // NOTE(review): presumably only reached if details::abort() returns - confirm
+        }
+
+        unsigned int mask;
+        unsigned int base_offset = thread_rank() & (~(tilesz - 1)); // block rank of the first thread in this tile
+        unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); // shorter than tilesz when fewer threads remain in the block
+
+        mask = (unsigned int)(-1) >> (32 - masklength); // low `masklength` bits set
+        mask <<= (details::laneid() & ~(tilesz - 1)); // moved up to the lane at which this tile starts
+        thread_group tile = thread_group(details::coalesced_group_id); // the result is tagged as a coalesced group
+        tile._data.coalesced.mask = mask;
+        tile._data.coalesced.size = __popc(mask);
+        tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz; // ceiling division: number of tiles covering the block
+        tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
+        tile._data.coalesced.is_tiled = true;
+        return (tile);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    _CG_STATIC_QUALIFIER void sync() { // forwards to details::cta::sync()
+        details::cta::sync();
+    }
+
+#if defined(_CG_CPP11_FEATURES)
+    struct arrival_token {}; // empty token type
+
+    _CG_QUALIFIER arrival_token barrier_arrive() const { // does nothing besides returning a token
+        return arrival_token();
+    }
+
+    _CG_QUALIFIER void barrier_wait(arrival_token&&) const { // the whole barrier happens here, via details::cta::sync()
+        details::cta::sync();
+    }
+#endif
+
+    // The static queries below forward to details::cta helpers
+    _CG_STATIC_QUALIFIER unsigned int size() {
+        return details::cta::size();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return details::cta::thread_rank();
+    }
+
+    // Additional functionality exposed by the group
+    _CG_STATIC_QUALIFIER dim3 group_index() {
+        return details::cta::group_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index() {
+        return details::cta::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() { // forwards to details::cta::block_dim(), the one helper whose name differs
+        return details::cta::block_dim();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads() {
+        return details::cta::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads() {
+        return details::cta::num_threads();
+    }
+
+};
+
+/**
+ * thread_block this_thread_block()
+ *
+ * Constructs a thread_block group
+ */
+_CG_QUALIFIER thread_block this_thread_block() {
+    thread_block block_handle;  // private default constructor; this function is its friend
+    return block_handle;
+}
+
+#if defined(_CG_CPP11_FEATURES)
+template <unsigned int MaxBlockSize> _CG_QUALIFIER
+thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
+    return thread_block(scratch);  // private scratch-taking constructor; this function is its friend
+}
+#endif
+
+/**
+ * class coalesced_group
+ *
+ * A group representing the current set of converged threads in a warp.
+ * The size of the group is not guaranteed and it may return a group of
+ * only one thread (itself).
+ *
+ * This group exposes warp-synchronous builtins.
+ * Constructed via coalesced_threads();
+ */
+class coalesced_group : public thread_group_base<details::coalesced_group_id>
+{
+private:
+    friend _CG_QUALIFIER coalesced_group coalesced_threads();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
+    friend class details::_coalesced_group_data_access;
+
+    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const { // re-indexes a lane mask by member rank: result bit r is set when the r-th member lane is set in laneMask
+        unsigned int member_pack = 0;
+        unsigned int member_rank = 0;
+        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
+            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); // NOTE(review): at bit_idx == 31 the literal 1 is a signed int shifted into its sign bit
+            if (lane_bit) {
+                if (laneMask & lane_bit)
+                    member_pack |= 1 << member_rank;
+                member_rank++; // counts member lanes only, so ranks stay dense
+            }
+        }
+        return (member_pack);
+    }
+
+    // Internal Use
+    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const { // builds the sub-group of at most tilesz members that contains the calling thread
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (coalesced_group(0));
+        }
+        if (size() <= tilesz) { // nothing to split: the group already fits in one tile
+            return (*this);
+        }
+
+        if ((_data.coalesced.is_tiled == true) && pow2_tilesz) { // parent is itself a tile: the sub-tile mask is computed arithmetically
+            unsigned int base_offset = (thread_rank() & (~(tilesz - 1))); // rank of the first member of this sub-tile
+            unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
+            unsigned int mask = (unsigned int)(-1) >> (32 - masklength); // low `masklength` bits set
+
+            mask <<= (details::laneid() & ~(tilesz - 1)); // moved up to the lane at which this sub-tile starts
+            coalesced_group coalesced_tile = coalesced_group(mask);
+            coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
+            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+            coalesced_tile._data.coalesced.is_tiled = true;
+            return (coalesced_tile);
+        }
+        else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) { // arbitrary lane mask: walk the member lanes one by one
+            unsigned int mask = 0;
+            unsigned int member_rank = 0;
+            int seen_lanes = (thread_rank() / tilesz) * tilesz; // number of member lanes that belong to earlier tiles
+            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
+                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
+                if (lane_bit) {
+                    if (seen_lanes <= 0 && member_rank < tilesz) { // past the earlier tiles and this tile is not yet full
+                        mask |= lane_bit;
+                        member_rank++;
+                    }
+                    seen_lanes--;
+                }
+            }
+            coalesced_group coalesced_tile = coalesced_group(mask); // constructor leaves is_tiled false for this result
+            // Override parent with the size of this group
+            coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz; // ceiling division, unlike the tiled branch above
+            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+            return coalesced_tile;
+        }
+        else {
+            // None in _CG_VERSION 1000
+            details::abort();
+        }
+
+        return (coalesced_group(0));
+    }
+
+ protected:
+    _CG_QUALIFIER coalesced_group(unsigned int mask) { // group made of the lanes set in `mask`; starts as rank 0 of 1 and not tiled
+        _data.coalesced.mask = mask;
+        _data.coalesced.size = __popc(mask);
+        _data.coalesced.metaGroupRank = 0;
+        _data.coalesced.metaGroupSize = 1;
+        _data.coalesced.is_tiled = false;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const { // raw lane mask of the members
+        return (_data.coalesced.mask);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    _CG_QUALIFIER unsigned int num_threads() const { // member count stored at construction
+        return _data.coalesced.size;
+    }
+
+    _CG_QUALIFIER unsigned int size() const { // same value as num_threads()
+        return num_threads();
+    }
+
+    _CG_QUALIFIER unsigned int thread_rank() const { // number of member lanes selected by details::lanemask32_lt()
+        return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
+    }
+
+    // Rank of this group in the upper level of the hierarchy
+    _CG_QUALIFIER unsigned int meta_group_rank() const {
+        return _data.coalesced.metaGroupRank;
+    }
+
+    // Total num partitions created out of all CTAs when the group was created
+    _CG_QUALIFIER unsigned int meta_group_size() const {
+        return _data.coalesced.metaGroupSize;
+    }
+
+    _CG_QUALIFIER void sync() const { // warp-level sync restricted to the member lanes
+        __syncwarp(_data.coalesced.mask);
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { // reads `elem` from the member whose rank is srcRank
+        unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 : // rank 0 is the lowest set lane of the mask
+            (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1)); // full warp: rank equals lane; otherwise translate rank to lane with __fns
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
+        if (size() == 32) { // full warp: use the shfl_down dispatch directly
+            return details::tile::shuffle_dispatch<TyElem>::shfl_down(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); // source lane located with __fns, starting from the caller's own lane
+
+        if (lane >= 32) // no such member: fall back to the caller's own lane
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
+        if (size() == 32) { // full warp: use the shfl_up dispatch directly
+            return details::tile::shuffle_dispatch<TyElem>::shfl_up(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); // same search as shfl_down but with a negated count
+        if (lane >= 32) // no such member: fall back to the caller's own lane
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+#else
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const { // non-C++11 variant: arithmetic types only, direct __shfl_sync
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
+            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
+        if (lane >= 32) lane = details::laneid(); // no such member: read the caller's own value
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
+        if (lane >= 32) lane = details::laneid(); // no such member: read the caller's own value
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+#endif
+
+    _CG_QUALIFIER int any(int predicate) const { // nonzero when the ballot over the member lanes has any bit set
+        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
+    }
+    _CG_QUALIFIER int all(int predicate) const { // nonzero when the ballot equals the member mask
+        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
+    }
+    _CG_QUALIFIER unsigned int ballot(int predicate) const { // ballot result indexed by member rank
+        if (size() == 32) { // full warp: the raw ballot is returned unchanged
+            return (__ballot_sync(0xFFFFFFFF, predicate));
+        }
+        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
+        return (_packLanes(lane_ballot));
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const { // __match_any_sync result, re-indexed by member rank unless the group is a full warp
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__match_any_sync(0xFFFFFFFF, val));
+        }
+        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
+        return (_packLanes(lane_match));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const { // as match_any, using __match_all_sync, which also writes `pred`
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__match_all_sync(0xFFFFFFFF, val, &pred));
+        }
+        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
+        return (_packLanes(lane_match));
+    }
+
+#endif /* !_CG_HAS_MATCH_COLLECTIVE */
+
+};
+
+_CG_QUALIFIER coalesced_group coalesced_threads() {
+    const unsigned int active_lanes = __activemask();  // lane mask reported by __activemask() at this call
+    return coalesced_group(active_lanes);
+}
+
+namespace details { // compile-time checks on static tile sizes
+    template <unsigned int Size> struct verify_thread_block_tile_size; // primary template left undefined: only the sizes specialized below provide ::OK
+    template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<8>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<4>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<2>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<1>  { typedef void OK; };
+
+#ifdef _CG_CPP11_FEATURES
+    template <unsigned int Size>
+    using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>; // the expression is also true for Size == 0
+
+    template <unsigned int Size>
+    using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>; // at most 32 threads
+    template <unsigned int Size>
+    using _is_multi_warp =
+    _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>; // more than 32 and at most 1024 threads
+
+    template <unsigned int Size>
+    using _is_valid_single_warp_tile =
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>; // power of two and single warp
+    template <unsigned int Size>
+    using _is_valid_multi_warp_tile =
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>; // power of two and multi warp
+#else
+    template <unsigned int Size>
+    struct _is_multi_warp { // without _CG_CPP11_FEATURES no size is reported as multi-warp
+        static const bool value = false;
+    };
+#endif
+}
+
+template <unsigned int Size>
+class __static_size_tile_base
+{
+protected:
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    // Caller's rank inside its tile: details::cta::thread_rank() masked with (numThreads - 1)
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return ((numThreads - 1) & details::cta::thread_rank());
+    }
+
+    // Compile-time tile width in threads
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
+        return numThreads;
+    }
+
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
+        return numThreads;
+    }
+};
+
+template <unsigned int Size>
+class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
+{
+    friend class details::_coalesced_group_data_access;
+    typedef details::tile::tile_helpers<Size> th;  // source of tileMask and laneMask for this tile width
+
+#ifdef _CG_CPP11_FEATURES
+    static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
+#else
+    typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
+#endif
+    using __static_size_tile_base<Size>::numThreads;
+    _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
+
+protected:
+    _CG_STATIC_QUALIFIER unsigned int build_mask() {
+        // A 32-thread tile uses the full mask.
+        if (numThreads == 32) {
+            return fullMask;
+        }
+        // The lane id [0,31] with the th::laneMask bits cleared is the shift
+        // that moves the tile mask onto the partition this thread belongs to.
+        const unsigned int tileShift = details::laneid() & ~(th::laneMask);
+        return (th::tileMask << tileShift);
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
+
+    _CG_STATIC_QUALIFIER void sync() {
+        __syncwarp(build_mask());  // warp-level barrier restricted to the tile's mask
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    // Shuffles: each hands the element to shuffle_dispatch together with the tile mask and width
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
+        using dispatchT = details::tile::shuffle_dispatch<TyElem>;
+        return dispatchT::shfl(_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
+        using dispatchT = details::tile::shuffle_dispatch<TyElem>;
+        return dispatchT::shfl_down(_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
+        using dispatchT = details::tile::shuffle_dispatch<TyElem>;
+        return dispatchT::shfl_up(_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
+        using dispatchT = details::tile::shuffle_dispatch<TyElem>;
+        return dispatchT::shfl_xor(_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
+    }
+#else
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
+        details::assert_if_not_arithmetic<TyIntegral>();  // arithmetic types only on this path
+        return __shfl_sync(build_mask(), var, srcRank, numThreads);
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return __shfl_down_sync(build_mask(), var, delta, numThreads);
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return __shfl_up_sync(build_mask(), var, delta, numThreads);
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return __shfl_xor_sync(build_mask(), var, laneMask, numThreads);
+    }
+#endif //_CG_CPP11_FEATURES
+
+    _CG_QUALIFIER int any(int predicate) const {
+        // A non-zero ballot over the tile's mask means some member passed a true predicate.
+        return (__ballot_sync(build_mask(), predicate) != 0);
+    }
+    _CG_QUALIFIER int all(int predicate) const {
+        const unsigned int tile = build_mask();  // the ballot must reproduce the tile mask exactly
+        return (__ballot_sync(tile, predicate) == tile);
+    }
+    _CG_QUALIFIER unsigned int ballot(int predicate) const {
+        const unsigned int tileShift = details::laneid() & (~(th::laneMask));
+        return (__ballot_sync(build_mask(), predicate) >> tileShift);  // undo the shift applied in build_mask()
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        const unsigned int tileShift = details::laneid() & (~(th::laneMask));
+        return (__match_any_sync(build_mask(), val) >> tileShift);
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        const unsigned int tileShift = details::laneid() & (~(th::laneMask));
+        return (__match_all_sync(build_mask(), val, &pred) >> tileShift);
+    }
+#endif
+
+};
+
+template <unsigned int Size, typename ParentT>
+class __static_parent_thread_block_tile_base
+{
+public:
+    // Index of this tile within ParentT: the parent rank divided by the tile size
+    _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
+        return (ParentT::thread_rank() / Size);
+    }
+
+    // Number of Size-wide tiles needed to cover ParentT, rounded up
+    _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
+        return ((ParentT::size() + (Size - 1)) / Size);
+    }
+};
+
+/**
+ * class thread_block_tile<unsigned int Size, ParentT = void>
+ *
+ * Statically-sized group type, representing one tile of a thread block.
+ * The only specializations currently supported are those with native
+ * hardware support (1/2/4/8/16/32)
+ *
+ * This group exposes warp-synchronous builtins.
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
+ */
+
+template <unsigned int Size, typename ParentT = void>
+class __single_warp_thread_block_tile :
+    public __static_size_thread_block_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>
+{
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    friend class details::_coalesced_group_data_access;
+    // Both constructors are protected and store no state; the two-argument form ignores its arguments.
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile() {}
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) {}
+    // Mask of the caller's tile, rebuilt on every call.
+    _CG_STATIC_QUALIFIER unsigned int get_mask() {
+        return (__static_size_thread_block_tile_base<Size>::build_mask());
+    }
+};
+
+template <unsigned int Size>
+class __single_warp_thread_block_tile<Size, void> :
+    public __static_size_thread_block_tile_base<Size>,
+    public thread_group_base<details::coalesced_group_id>
+{
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
+
+    template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
+    friend class details::_coalesced_group_data_access;
+
+    typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
+    // Without a static parent, the mask, size and meta-group values are kept in _data.coalesced.
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank = 0, unsigned int meta_group_size = 1) {
+        _data.coalesced.is_tiled = true;
+        _data.coalesced.size = numThreads;
+        _data.coalesced.mask = staticSizeBaseT::build_mask();
+        _data.coalesced.metaGroupSize = meta_group_size;
+        _data.coalesced.metaGroupRank = meta_group_rank;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const {
+        return _data.coalesced.mask;  // the mask captured by the constructor
+    }
+
+public:
+    using staticSizeBaseT::sync;
+    using staticSizeBaseT::size;
+    using staticSizeBaseT::num_threads;
+    using staticSizeBaseT::thread_rank;
+
+    _CG_QUALIFIER unsigned int meta_group_rank() const {
+        return (_data.coalesced.metaGroupRank);
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {
+        return (_data.coalesced.metaGroupSize);
+    }
+};
+
+/**
+ * Outer level API calls
+ * void sync(GroupT) - see <group_type>.sync()
+ * thread_rank(GroupT) - see <group_type>.thread_rank()
+ * group_size(GroupT) - see <group_type>.num_threads()
+ */
+template <class GroupT>
+_CG_QUALIFIER void sync(const GroupT& g)
+{   // Free-function form: forwards to the group's own sync().
+    g.sync();
+}
+
+// TODO: Use a static dispatch to determine appropriate return type
+// C++03 is stuck with unsigned long long for now
+#ifdef _CG_CPP11_FEATURES
+template <class GroupT>
+_CG_QUALIFIER auto thread_rank(const GroupT& g) -> decltype(g.thread_rank()) {
+    return (g.thread_rank());
+}
+
+// Free-function form of <group_type>.num_threads(); the return type is that of the member
+template <class GroupT>
+_CG_QUALIFIER auto group_size(const GroupT& g) -> decltype(g.num_threads()) {
+    return (g.num_threads());
+}
+#else
+template <class GroupT>
+_CG_QUALIFIER unsigned long long thread_rank(const GroupT& g) {
+    return static_cast<unsigned long long>(g.thread_rank());
+}
+
+// Free-function form of <group_type>.num_threads(), converted to unsigned long long
+template <class GroupT>
+_CG_QUALIFIER unsigned long long group_size(const GroupT& g) {
+    return static_cast<unsigned long long>(g.num_threads());
+}
+#endif
+
+
+/**
+ * tiled_partition
+ *
+ * The tiled_partition(parent, tilesz) method is a collective operation that
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
+ *
+ * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
+ * be created where threads having identical k = (thread_rank(parent)/tilesz)
+ * will be members of the same subgroup.
+ *
+ * The implementation may cause the calling thread to wait until all the members
+ * of the parent group have invoked the operation before resuming execution.
+ *
+ * Functionality is limited to power-of-two sized subgroup instances of at most
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
+ * tiled_partition() in _CG_VERSION 1000.
+ */
+_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
+{
+    // Runtime dispatch on the group's type tag.
+    if (parent.get_type() != details::coalesced_group_id) {
+        // NOTE(review): any tag other than coalesced_group_id is handled as a thread_block;
+        // confirm that callers never pass another group kind.
+        const thread_block *asBlock = static_cast<const thread_block*>(&parent);
+        return asBlock->_get_tiled_threads(tilesz);
+    }
+    return static_cast<const coalesced_group*>(&parent)->_get_tiled_threads(tilesz);
+}
+
+// thread_block overload: forwards to the block's tiling routine; the result is a plain thread_group for now
+_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
+{
+    return parent._get_tiled_threads(tilesz);
+}
+
+// coalesced_group overload: the result keeps the coalesced_group type
+_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
+{
+    return parent._get_tiled_threads(tilesz);
+}
+
+namespace details {
+    template <unsigned int Size, typename ParentT>
+    class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
+    // Creates a single-warp tile of the given static size without taking a parent object
+    template <unsigned int Size, typename ParentT>
+    _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
+        return internal_thread_block_tile<Size, ParentT>();
+    }
+    // The three helpers below only forward to members of the group they are given
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal multi_warp_collectives_helper(
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda) {
+        return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
+    }
+
+    template <typename T, typename GroupT>
+    _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
+        return (group.template get_scratch_location<T>(warp_id));
+    }
+
+    template <typename GroupT>
+    _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
+        return (group.get_sync_location());
+    }
+
+}
+/**
+ * tiled_partition<tilesz>
+ *
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
+ *
+ * A total of (size(parent)/tilesz) subgroups will be created,
+ * therefore the parent group size must be evenly divisible by the tilesz.
+ * The allowed parent groups are thread_block or thread_block_tile<size>.
+ *
+ * The implementation may cause the calling thread to wait until all the members
+ * of the parent group have invoked the operation before resuming execution.
+ *
+ * Tiles within a warp are limited to native hardware sizes, 1/2/4/8/16/32; with C++11
+ * features, larger power-of-two tiles (up to 1024 threads) are also accepted. The size(parent)
+ * must be greater than the template Size parameter, otherwise the results are undefined.
+ */
+
+#if defined(_CG_CPP11_FEATURES)
+template <unsigned int Size>
+class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
+{
+    static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512/1024");
+    // The details:: helpers below are befriended so they can reach the private members of this class.
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    friend __device__ TyVal details::multi_warp_collectives_helper(
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda);
+    template <typename T, typename GroupT>
+    friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
+    template <typename GroupT>
+    friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
+    template <unsigned int OtherSize>
+    friend class __static_size_multi_warp_tile_base;
+    using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+    using ThisType = __static_size_multi_warp_tile_base<Size>;
+    _CG_STATIC_CONST_DECL int numWarps = Size / 32;  // 32-thread warps per tile
+
+protected:
+    details::multi_warp_scratch* const tile_memory;  // scratch block taken over from the source group
+    // Adopts g.tile_memory; without user-provided shared memory, SM80+ also resets this tile size's barrier and syncs g.
+    template <typename GroupT>
+    _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
+#if !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
+        NV_IF_TARGET(NV_PROVIDES_SM_80,
+            details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
+            g.sync();
+        )
+#endif
+    }
+
+    // Barrier and scratch-slot lookups used by the collectives further down
+private:
+    _CG_QUALIFIER details::barrier_t* get_sync_location() const {
+        // Different group sizes use different barriers, all groups of a given size share one barrier.
+        unsigned int sync_id = details::log2(Size / 64);
+        return &tile_memory->barriers[sync_id];
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
+        unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;  // block-level warp index of the tile's first thread, plus warp_id
+        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location() const {
+        unsigned int scratch_id = details::cta::thread_rank() / 32;  // block-level warp index of the calling thread
+        return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
+    }
+    // Passes val from the thread with tile rank src to every caller through the source warp's scratch slot.
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
+        unsigned int src_warp = src / 32;
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+
+        // Get warp slot of the source threads warp.
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
+
+        if (warp.meta_group_rank() == src_warp) {
+            warp.sync();
+            // Put shuffled value into my warp slot and let my warp arrive at the barrier.
+            if (thread_rank() == src) {
+                *warp_scratch_location = val;
+            }
+            details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
+            TyVal result = *warp_scratch_location;
+            details::sync_warps_wait(sync_location, details::cta::thread_rank());
+            return result;
+        }
+        else {
+            // Wait for the source warp to arrive on the barrier.
+            details::sync_warps_wait_for_specific_warp(sync_location,
+                    (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
+            TyVal result = *warp_scratch_location;
+            details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
+            return result;
+        }
+    }
+    // Runs warp_lambda in every warp, then inter_warp_lambda inside the branch guarded by sync_warps_last_releases.
+    template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
+        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
+                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>();
+
+        warp_lambda(warp, warp_scratch_location);
+
+        if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
+            auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
+            if (subwarp.meta_group_rank() == 0) {
+                TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
+                inter_warp_lambda(subwarp, thread_scratch_location);
+            }
+            warp.sync();
+            details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
+        }
+        TyVal result = *warp_scratch_location;
+        return result;
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
+
+    using __static_size_tile_base<Size>::thread_rank;
+    // Tile-wide shuffle: returns the val passed by the thread whose tile rank is src.
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
+        static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
+                      "Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
+        return shfl_impl(val, src);
+    }
+
+    _CG_QUALIFIER void sync() const {
+        details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);  // barrier shared by tiles of this size
+    }
+    // Two-stage vote: __any_sync within each warp, then __any_sync over the per-warp results.
+    _CG_QUALIFIER int any(int predicate) const {
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+    // Two-stage vote: __all_sync within each warp, then __all_sync over the per-warp results.
+    _CG_QUALIFIER int all(int predicate) const {
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+};
+
+
+template <unsigned int Size, typename ParentT = void>
+class __multi_warp_thread_block_tile :
+    public __static_size_multi_warp_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>
+{
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
+protected:
+    // The parent group is only handed on to the multi-warp base constructor.
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) : staticTileBaseT(g) {}
+};
+
+template <unsigned int Size>
+class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
+{
+    const unsigned int metaGroupRank;  // both values are copied from the source tile at construction
+    const unsigned int metaGroupSize;
+
+protected:
+    template <unsigned int OtherSize, typename ParentT>
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
+        __static_size_multi_warp_tile_base<Size>(g),
+        metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
+public:
+    _CG_QUALIFIER unsigned int meta_group_rank() const {
+        return (metaGroupRank);
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {
+        return (metaGroupSize);
+    }
+};
+#endif
+
+template <unsigned int Size, typename ParentT = void>
+class thread_block_tile;
+
+namespace details {
+    template <unsigned int Size, typename ParentT, bool IsMultiWarp>
+    class thread_block_tile_impl;  // primary template is only declared; the IsMultiWarp specializations below define it
+    // Single-warp case: inherits the single-warp tile implementation
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
+    {
+    protected:
+        template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>  // from another tile: forwards its meta group rank and size
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
+            __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
+        // From a thread_block: the base is default-constructed and g itself is not read
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
+            __single_warp_thread_block_tile<Size, ParentT>() {}
+    };
+    // Multi-warp case: backed by __multi_warp_thread_block_tile only when C++11 features are available
+#if defined(_CG_CPP11_FEATURES)
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
+    {
+        protected:
+        template <typename GroupT>  // hands the source group on to the multi-warp tile constructor
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
+            __multi_warp_thread_block_tile<Size, ParentT>(g) {}
+    };
+#else
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true>  // placeholder with no base class and an empty constructor
+    {
+        protected:
+        template <typename GroupT>
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
+    };
+#endif
+}
+
+template <unsigned int Size, typename ParentT>
+class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
+{
+    friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();  // this_thread() constructs a tile directly
+
+protected:
+    _CG_QUALIFIER thread_block_tile(const ParentT& g) :  // built from the parent group; the implementation is chosen by _is_multi_warp<Size>
+        details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
+
+public:
+    _CG_QUALIFIER operator thread_block_tile<Size, void>() const {  // conversion to the same-size tile with ParentT = void
+        return thread_block_tile<Size, void>(*this);
+    }
+};
+
+template <unsigned int Size>
+class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
+{
+    template <unsigned int, typename ParentT>
+    friend class thread_block_tile;  // every thread_block_tile specialization may use the protected constructor below
+
+protected:
+    template <unsigned int OtherSize, typename OtherParentT>  // from a tile of any size and parent
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
+
+public:
+    template <typename ParentT>  // public conversion from a tile of the same Size with any ParentT
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
+};
+
+namespace details {
+    template <unsigned int Size, typename ParentT>
+    struct tiled_partition_impl;  // only declared here; defined for the parent types specialized below
+    // ParentT = thread_block specialization
+    template <unsigned int Size>
+    struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
+        _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
+            thread_block_tile<Size, thread_block>(g) {}
+    };
+
+    // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
+    template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
+    struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
+        public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
+#ifdef _CG_CPP11_FEATURES
+        static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");  // a sub-tile must be strictly smaller than its parent tile
+#endif
+        _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
+            thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
+    };
+
+}
+
+template <unsigned int Size, typename ParentT>
+_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
+{   // Builds the helper for this parent type; the return converts it to its thread_block_tile base.
+    return details::tiled_partition_impl<Size, ParentT>(g);
+}
+
+/**
+ * thread_block_tile<1, void> this_thread()
+ *
+ * Constructs a single-thread tile containing only the calling thread
+ */
+_CG_QUALIFIER thread_block_tile<1, void> this_thread()
+{
+    // A thread_block_tile<1, thread_block> is made the parent of the returned group, so that the
+    // returned group's meta group rank and size are 0 and 1 respectively.
+    return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
+}
+
+/**
+ * <group_type>.sync()
+ *
+ * Executes a barrier across the group
+ *
+ * Implements both a compiler fence and an architectural fence to prevent
+ * memory reordering around the barrier.
+ */
+_CG_QUALIFIER void thread_group::sync() const
+{
+    switch (_data.group.type) {  // forward to the concrete group type named by the stored tag
+    case details::coalesced_group_id:
+        cooperative_groups::sync(static_cast<const coalesced_group&>(*this));
+        return;
+    case details::thread_block_id:
+        cooperative_groups::sync(static_cast<const thread_block&>(*this));
+        return;
+    case details::grid_group_id:
+        cooperative_groups::sync(static_cast<const grid_group&>(*this));
+        return;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        cooperative_groups::sync(static_cast<const multi_grid_group&>(*this));
+        return;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        cooperative_groups::sync(static_cast<const cluster_group&>(*this));
+        return;
+#endif
+    default:
+        return;  // any other type tag: nothing is synchronized
+    }
+}
+
+/**
+ * <group_type>.size()
+ *
+ * Returns the total number of threads in the group.
+ */
+_CG_QUALIFIER unsigned long long thread_group::size() const
+{
+    unsigned long long count = 0;  // stays 0 when the type tag matches no case
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        count = cooperative_groups::group_size(static_cast<const coalesced_group&>(*this));
+        break;
+    case details::thread_block_id:
+        count = cooperative_groups::group_size(static_cast<const thread_block&>(*this));
+        break;
+    case details::grid_group_id:
+        count = cooperative_groups::group_size(static_cast<const grid_group&>(*this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        count = cooperative_groups::group_size(static_cast<const multi_grid_group&>(*this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        count = cooperative_groups::group_size(static_cast<const cluster_group&>(*this));
+        break;
+#endif
+    default:
+        break;
+    }
+    return count;
+}
+
+/**
+ * <group_type>.thread_rank()
+ *
+ * Returns the linearized rank of the calling thread along the interval [0, size()).
+ */
+_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
+{
+    unsigned long long myRank = 0;  // stays 0 when the type tag matches no case
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        myRank = cooperative_groups::thread_rank(static_cast<const coalesced_group&>(*this));
+        break;
+    case details::thread_block_id:
+        myRank = cooperative_groups::thread_rank(static_cast<const thread_block&>(*this));
+        break;
+    case details::grid_group_id:
+        myRank = cooperative_groups::thread_rank(static_cast<const grid_group&>(*this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        myRank = cooperative_groups::thread_rank(static_cast<const multi_grid_group&>(*this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        myRank = cooperative_groups::thread_rank(static_cast<const cluster_group&>(*this));
+        break;
+#endif
+    default:
+        break;
+    }
+    return myRank;
+}
+
+_CG_END_NAMESPACE
+
+#include <cooperative_groups/details/partitioning.h>
+#if (!defined(_MSC_VER) || defined(_WIN64))
+# include <cooperative_groups/details/invoke.h>
+#endif
+
+# endif /* ! (__cplusplus, __CUDACC__) */
+
+#endif /* !_COOPERATIVE_GROUPS_H_ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/common_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/common_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7e70950fb51d0d58f8dd99239e6b36ba89c4779
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/common_functions.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/common_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/common_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
+#endif
+
+#if !defined(__COMMON_FUNCTIONS_H__)
+#define __COMMON_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+#define __CUDACC_VER__ "__CUDACC_VER__ is no longer supported.  Use __CUDACC_VER_MAJOR__, __CUDACC_VER_MINOR__, and __CUDACC_VER_BUILD__ instead."
+
+#ifndef __CUDA_API_VER_MAJOR__
+#define __CUDA_API_VER_MAJOR__ __CUDACC_VER_MAJOR__
+#endif /* __CUDA_API_VER_MAJOR__ */
+
+#ifndef __CUDA_API_VER_MINOR__
+#define __CUDA_API_VER_MINOR__ __CUDACC_VER_MINOR__
+#endif /* __CUDA_API_VER_MINOR__ */
+
+/* Outside NVRTC the host C headers are included and the declarations below
+ * are wrapped in extern "C"; when __CUDACC_RTC__ is defined neither the
+ * includes nor the linkage wrapper are emitted. */
+#if !defined(__CUDACC_RTC__)
+#include <string.h>
+#include <time.h>
+
+extern "C"
+{
+#endif /* !__CUDACC_RTC__ */
+/* clock(), memset() and memcpy() declared with both __host__ and __device__.
+ * On QNX the clock() declaration is bound to the assembler name "clock32". */
+extern _CRTIMP __host__ __device__ __device_builtin__ __cudart_builtin__ clock_t __cdecl clock(void)
+#if defined(__QNX__)
+asm("clock32")
+#endif
+__THROW;
+extern         __host__ __device__ __device_builtin__ __cudart_builtin__ void*   __cdecl memset(void*, int, size_t) __THROW;
+extern         __host__ __device__ __device_builtin__ __cudart_builtin__ void*   __cdecl memcpy(void*, const void*, size_t) __THROW;
+#if !defined(__CUDACC_RTC__)
+}
+#endif /* !__CUDACC_RTC__ */
+
+#if defined(__CUDA_ARCH__)
+
+/* Placement forms of operator new/delete. This region sits inside the
+ * enclosing #if defined(__CUDA_ARCH__) section of this header. */
+#if defined(__CUDACC_RTC__)
+/* NVRTC: the placement forms are defined inline here. new/new[] return the
+ * supplied pointer unchanged; the matching deletes have empty bodies. */
+inline __host__ __device__ void* operator new(size_t, void *p) { return p; }
+inline __host__ __device__ void* operator new[](size_t, void *p) { return p; }
+inline __host__ __device__ void operator delete(void*, void*) { }
+inline __host__ __device__ void operator delete[](void*, void*) { }
+#else /* !__CUDACC_RTC__ */
+/* <new> is included unless __CUDA_INTERNAL_SKIP_CPP_HEADERS__ is defined. */
+#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
+#include <new>
+#endif
+
+/* STD expands to "std::" when __GNUC__ is defined and to nothing otherwise.
+ * It prefixes size_t (and bad_alloc) in the declarations of this header and
+ * is #undef'd after the global operator new/delete declarations. */
+#if defined (__GNUC__)
+
+#define STD \
+        std::
+        
+#else /* __GNUC__ */
+
+#define STD
+
+#endif /* __GNUC__ */
+
+/* Non-RTC: the placement forms are only declared, each with throw(). */
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, void*) throw();
+/* operator delete overloads taking a size, declared for C++14 and later,
+ * MSVC 1900 and later, or when the XLC/ICC C++14 macros are defined. */
+# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t) throw();
+#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__)  || defined(__CUDA_ICC_CPP14__) */
+#endif /* __CUDACC_RTC__ */
+
+#if !defined(__CUDACC_RTC__)
+#include <stdio.h>
+#include <stdlib.h>
+#endif /* !__CUDACC_RTC__ */
+
+/* On QNX when _LIBCPP_VERSION is not defined, the C library declarations
+ * below are placed inside namespace std. */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+extern "C"
+{
+/* printf() declaration. _CRTIMP is applied unless the compiler is MSVC 1900
+ * or later; __THROW is appended only for glibc versions older than 2.3. */
+extern
+#if !defined(_MSC_VER) || _MSC_VER < 1900
+_CRTIMP
+#endif
+            
+#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) 
+__host__ __device__ __device_builtin__ __cudart_builtin__ int     __cdecl printf(const char*, ...) __THROW;
+#else /* newer glibc */
+__host__ __device__ __device_builtin__ __cudart_builtin__ int     __cdecl printf(const char*, ...);
+#endif /* defined(__GLIBC__) && defined(__GLIBC_MINOR__) && ( (__GLIBC__ < 2) || ( (__GLIBC__ == 2) && (__GLIBC_MINOR__ < 3) ) ) */
+
+
+/* malloc()/free() declared with both __host__ and __device__. */
+extern _CRTIMP __host__ __device__ __cudart_builtin__ void*   __cdecl malloc(size_t) __THROW;
+extern _CRTIMP __host__ __device__ __cudart_builtin__ void    __cdecl free(void*) __THROW;
+
+/* MSVC only: declaration of _alloca(). */
+#if defined(_MSC_VER)
+extern  __host__ __device__ __cudart_builtin__ void*   __cdecl _alloca(size_t);
+#endif
+
+/* QNX only: any existing alloca macro is replaced by one that expands to
+ * __builtin_alloca. */
+#if defined(__QNX__)
+#undef alloca
+#define alloca(__S) __builtin_alloca(__S)
+#endif
+}
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+#if !defined(__CUDACC_RTC__)
+#include <assert.h>
+#endif /* !__CUDACC_RTC__ */
+
+/* Declares one platform-specific assertion-failure routine with both
+ * __host__ and __device__; at most one branch of the #if chain below is
+ * selected. For NVRTC this is the __assertfail() that the assert() macro
+ * defined later in this header calls. The other names are presumably the
+ * hooks used by each platform's <assert.h> -- confirm against those headers. */
+extern "C"
+{
+#if defined(__CUDACC_RTC__)
+extern __host__ __device__ void __assertfail(const char * __assertion, 
+                                             const char *__file,
+                                             unsigned int __line,
+                                             const char *__function,
+                                             size_t charsize);
+#elif defined(__APPLE__)
+/* Apple: __builtin_expect is redefined to evaluate to its first argument. */
+#define __builtin_expect(exp,c) (exp)
+extern __host__ __device__ __cudart_builtin__ void __assert_rtn(
+  const char *, const char *, int, const char *);
+#elif defined(__ANDROID__)
+extern __host__ __device__ __cudart_builtin__ void __assert2(
+  const char *, int, const char *, const char *);
+#elif defined(__QNX__)
+/* QNX: __assert is declared inside namespace std unless _LIBCPP_VERSION is
+ * defined. */
+#if !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+extern __host__ __device__ __cudart_builtin__ void __assert(
+  const char *, const char *, unsigned int, const char *);
+#if !defined(_LIBCPP_VERSION)
+}
+#endif
+#elif defined(__HORIZON__)
+extern __host__ __device__ __cudart_builtin__ void __assert_fail(
+  const char *, const char *, int, const char *);
+#elif defined(__GNUC__)
+extern __host__ __device__ __cudart_builtin__ void __assert_fail(
+  const char *, const char *, unsigned int, const char *)
+  __THROW; 
+#elif defined(_WIN32)
+extern __host__ __device__ __cudart_builtin__ _CRTIMP void __cdecl _wassert(
+  const wchar_t *, const wchar_t *, unsigned);
+#endif
+}
+
+#if defined(__CUDACC_RTC__)
+/* NVRTC path: this header defines assert() itself. With NDEBUG it expands to
+ * a void no-op; otherwise a false expression calls __assertfail() with the
+ * stringified expression, __FILE__, __LINE__, __PRETTY_FUNCTION__ and
+ * sizeof(char). */
+#ifdef NDEBUG
+#define assert(e) (static_cast<void>(0))
+#else /* !NDEBUG */
+#define __ASSERT_STR_HELPER(x) #x
+#define assert(e) ((e) ? static_cast<void>(0)\
+                       : __assertfail(__ASSERT_STR_HELPER(e), __FILE__,\
+                                      __LINE__, __PRETTY_FUNCTION__,\
+                                      sizeof(char)))
+#endif /* NDEBUG */
+/* Global operator new/delete declared with both __host__ and __device__. */
+__host__ __device__  void* operator new(size_t);
+__host__ __device__  void* operator new[](size_t);
+__host__ __device__  void operator delete(void*);
+__host__ __device__  void operator delete[](void*);
+/* operator delete overloads taking a size, for C++14 and later. */
+# if __cplusplus >= 201402L
+__host__ __device__  void operator delete(void*, size_t);
+__host__ __device__  void operator delete[](void*, size_t);
+#endif /* __cplusplus >= 201402L */
+
+/* Overloads taking std::align_val_t, for C++17 and later. std::align_val_t
+ * is declared locally here; <new> is only included on the non-RTC path of
+ * this header. */
+#if __cplusplus >= 201703L
+namespace std { enum class align_val_t : size_t {}; }
+__host__ __device__ void*   __cdecl operator new(size_t sz, std::align_val_t) noexcept;
+__host__ __device__ void*   __cdecl operator new[](size_t sz, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete(void* ptr, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete[](void* ptr, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete(void* ptr, size_t, std::align_val_t) noexcept;
+__host__ __device__ void    __cdecl operator delete[](void* ptr, size_t, std::align_val_t) noexcept;
+#endif  /* __cplusplus >= 201703L */
+
+#else /* !__CUDACC_RTC__ */
+/* Non-RTC path. THROWBADALLOC is the exception specification attached to the
+ * operator new / new[] declarations below:
+ *   - with __GNUC__: empty when C++11 or later is in effect and the
+ *     QNX/libc++ condition holds, empty for the listed Android
+ *     configurations, otherwise throw(std::bad_alloc);
+ *   - without __GNUC__: throw(...).
+ * __NV_GLIBCXX_VERSION is a temporary helper encoding the GCC version and is
+ * #undef'd once THROWBADALLOC has been chosen. */
+#if defined (__GNUC__)
+
+#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 
+
+#if (__cplusplus >= 201103L)  && ((!(defined(__QNX__) && defined(_LIBCPP_VERSION))) || (defined(__QNX__) && __NV_GLIBCXX_VERSION >= 80300))
+#define THROWBADALLOC 
+#else
+#if defined(__ANDROID__) && !defined(_LIBCPP_VERSION) && (defined(__BIONIC__) || __NV_GLIBCXX_VERSION < 40900)
+#define THROWBADALLOC
+#else
+#define THROWBADALLOC  throw(STD bad_alloc)
+#endif
+#endif
+/* NOTE(review): __DELETE_THROW is defined here but is neither referenced nor
+ * #undef'd anywhere else in this header. */
+#define __DELETE_THROW throw()
+
+#undef __NV_GLIBCXX_VERSION
+
+#else /* __GNUC__ */
+
+#define THROWBADALLOC  throw(...)
+
+#endif /* __GNUC__ */
+
+/* Global operator new/delete declared with both __host__ and __device__. */
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t) THROWBADALLOC;
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t) THROWBADALLOC;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*) throw();
+/* operator delete overloads taking a size, declared for C++14 and later,
+ * MSVC 1900 and later, or when the XLC/ICC C++14 macros are defined. */
+# if __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t) throw();
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t) throw();
+#endif /* __cplusplus >= 201402L || (defined(_MSC_VER) && _MSC_VER >= 1900) || defined(__CUDA_XLC_CPP14__) || defined(__CUDA_ICC_CPP14__)  */
+
+/* Overloads taking std::align_val_t, declared when the __cpp_aligned_new
+ * feature-test macro is non-zero. */
+#if __cpp_aligned_new
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new(STD size_t, std::align_val_t);
+extern         __host__ __device__ __cudart_builtin__ void*   __cdecl operator new[](STD size_t, std::align_val_t);
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete(void*, STD size_t, std::align_val_t) noexcept;
+extern         __host__ __device__ __cudart_builtin__ void    __cdecl operator delete[](void*, STD size_t, std::align_val_t) noexcept;
+#endif  /* __cpp_aligned_new */
+
+/* The helper macros used by the declarations above are removed again. */
+#undef THROWBADALLOC
+#undef STD
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __CUDA_ARCH__ */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__) && (__CUDA_ARCH__ >= 350)
+#include "cuda_device_runtime_api.h"
+#endif
+
+#include "math_functions.h"
+
+#endif /* !__COMMON_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d244463e73f0f7569a4707002c8e059bca67c6d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/cudacc_ext.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2021-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/cudacc_ext.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/cudacc_ext.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
+#endif
+
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDACC_EXT_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f63063689d65c4a1dffb9a823ddaf6a5b353cba3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_double_functions.hpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_double_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_double_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__DEVICE_DOUBLE_FUNCTIONS_HPP__)
+#define __DEVICE_DOUBLE_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ __device__
+#else
+#define __DEVICE_DOUBLE_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/* fma(a, b, c) with an explicit rounding mode. Forwards to __fma_rz,
+ * __fma_ru or __fma_rd for cudaRoundZero, cudaRoundPosInf or cudaRoundMinInf
+ * respectively; every other mode value uses __fma_rn. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double fma(double a, double b, double c, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundZero) {
+    return __fma_rz(a, b, c);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __fma_ru(a, b, c);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __fma_rd(a, b, c);
+  }
+  return __fma_rn(a, b, c);
+}
+
+/* Multiplies a by b with an explicit rounding mode. Forwards to __dmul_rz,
+ * __dmul_ru or __dmul_rd for cudaRoundZero, cudaRoundPosInf or
+ * cudaRoundMinInf respectively; every other mode value uses __dmul_rn. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dmul(double a, double b, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundZero) {
+    return __dmul_rz(a, b);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __dmul_ru(a, b);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __dmul_rd(a, b);
+  }
+  return __dmul_rn(a, b);
+}
+
+/* Adds a and b with an explicit rounding mode. Forwards to __dadd_rz,
+ * __dadd_ru or __dadd_rd for cudaRoundZero, cudaRoundPosInf or
+ * cudaRoundMinInf respectively; every other mode value uses __dadd_rn. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dadd(double a, double b, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundZero) {
+    return __dadd_rz(a, b);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __dadd_ru(a, b);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __dadd_rd(a, b);
+  }
+  return __dadd_rn(a, b);
+}
+
+/* Subtracts b from a with an explicit rounding mode. Forwards to __dsub_rz,
+ * __dsub_ru or __dsub_rd for cudaRoundZero, cudaRoundPosInf or
+ * cudaRoundMinInf respectively; every other mode value uses __dsub_rn. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double dsub(double a, double b, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundZero) {
+    return __dsub_rz(a, b);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __dsub_ru(a, b);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __dsub_rd(a, b);
+  }
+  return __dsub_rn(a, b);
+}
+
+/* Converts a double to int with an explicit rounding mode. Forwards to
+ * __double2int_rn, __double2int_ru or __double2int_rd for cudaRoundNearest,
+ * cudaRoundPosInf or cudaRoundMinInf respectively; every other mode value
+ * uses __double2int_rz. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ int double2int(double a, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundNearest) {
+    return __double2int_rn(a);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __double2int_ru(a);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __double2int_rd(a);
+  }
+  return __double2int_rz(a);
+}
+
+/* Converts a double to unsigned int with an explicit rounding mode. Forwards
+ * to __double2uint_rn, __double2uint_ru or __double2uint_rd for
+ * cudaRoundNearest, cudaRoundPosInf or cudaRoundMinInf respectively; every
+ * other mode value uses __double2uint_rz. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned int double2uint(double a, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundNearest) {
+    return __double2uint_rn(a);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __double2uint_ru(a);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __double2uint_rd(a);
+  }
+  return __double2uint_rz(a);
+}
+
+/* Converts a double to long long int with an explicit rounding mode. Forwards
+ * to __double2ll_rn, __double2ll_ru or __double2ll_rd for cudaRoundNearest,
+ * cudaRoundPosInf or cudaRoundMinInf respectively; every other mode value
+ * uses __double2ll_rz. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ long long int double2ll(double a, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundNearest) {
+    return __double2ll_rn(a);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __double2ll_ru(a);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __double2ll_rd(a);
+  }
+  return __double2ll_rz(a);
+}
+
+/* Converts a double to unsigned long long int with an explicit rounding mode.
+ * Forwards to __double2ull_rn, __double2ull_ru or __double2ull_rd for
+ * cudaRoundNearest, cudaRoundPosInf or cudaRoundMinInf respectively; every
+ * other mode value uses __double2ull_rz. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ unsigned long long int double2ull(double a, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundNearest) {
+    return __double2ull_rn(a);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __double2ull_ru(a);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __double2ull_rd(a);
+  }
+  return __double2ull_rz(a);
+}
+
+/* Converts a long long int to double with an explicit rounding mode. Forwards
+ * to __ll2double_rz, __ll2double_ru or __ll2double_rd for cudaRoundZero,
+ * cudaRoundPosInf or cudaRoundMinInf respectively; every other mode value
+ * uses __ll2double_rn. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ll2double(long long int a, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundZero) {
+    return __ll2double_rz(a);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __ll2double_ru(a);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __ll2double_rd(a);
+  }
+  return __ll2double_rn(a);
+}
+
+/* Converts an unsigned long long int to double with an explicit rounding
+ * mode. Forwards to __ull2double_rz, __ull2double_ru or __ull2double_rd for
+ * cudaRoundZero, cudaRoundPosInf or cudaRoundMinInf respectively; every other
+ * mode value uses __ull2double_rn. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double ull2double(unsigned long long int a, enum cudaRoundMode mode)
+{
+  if (mode == cudaRoundZero) {
+    return __ull2double_rz(a);
+  }
+  if (mode == cudaRoundPosInf) {
+    return __ull2double_ru(a);
+  }
+  if (mode == cudaRoundMinInf) {
+    return __ull2double_rd(a);
+  }
+  return __ull2double_rn(a);
+}
+
+/* Converts an int to double with a plain cast. The mode argument is accepted
+ * but not consulted. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double int2double(int a, enum cudaRoundMode mode)
+{
+  const double converted = static_cast<double>(a);
+  return converted;
+}
+
+/* Converts an unsigned int to double with a plain cast. The mode argument is
+ * accepted but not consulted. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double uint2double(unsigned int a, enum cudaRoundMode mode)
+{
+  const double converted = static_cast<double>(a);
+  return converted;
+}
+
+/* Widens a float to double with a plain cast. The mode argument is accepted
+ * but not consulted. */
+__DEVICE_DOUBLE_FUNCTIONS_DECL__ double float2double(float a, enum cudaRoundMode mode)
+{
+  const double widened = static_cast<double>(a);
+  return widened;
+}
+
+#undef __DEVICE_DOUBLE_FUNCTIONS_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__DEVICE_DOUBLE_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_HPP__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_fp128_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_fp128_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..715220e121f790ab8ff2aeaed25620fe9759236f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_fp128_functions.h
@@ -0,0 +1,1217 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+// to easily switch off fp128 device functions if needed
+#ifndef __NV_DISABLE_DEVICE_FP128_FUNCTIONS__
+
+#if !defined(__DEVICE_FP128_FUNCTIONS_H__)
+#define __DEVICE_FP128_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { } /* each declaration below gets an empty body */
+#define __INLINE_IF_HOST__ __inline__
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA */
+#define __DEF_IF_HOST ; /* each declaration below stays a pure prototype */
+#define __INLINE_IF_HOST__
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+#define __DEVICE_FP128_FUNCTIONS_DECL__ __device__ __cudart_builtin__ __INLINE_IF_HOST__
+
+/*******************************************************************************
+*                                                                              *
+* Support for __float128 on:                                                   *
+*    - NVRTC on Linux                                                          *
+*    - GCC version 4.1 or later on x86_64/amd64                                *
+*    - Clang version 3.9 or later on x86_64/amd64                              *
+*    - NVHPC version 21.1 or later on x86_64/amd64                             *
+*                                                                              *
+*******************************************************************************/
+#if defined(__CUDACC_RTC__)
+#if !_WIN64 /* NVRTC: enabled unless _WIN64 evaluates to nonzero */
+#define __FLOAT128_CPP_SPELLING_ENABLED__
+#endif /* !_WIN64 */
+#else /* !__CUDACC_RTC__ */
+
+#if (defined __NVCOMPILER_MAJOR__)
+    #if (defined(__x86_64__) || defined(__amd64__)) && \
+        ((__NVCOMPILER_MAJOR__ > 21) || \
+            (__NVCOMPILER_MAJOR__ == 21 && __NVCOMPILER_MINOR__ >= 1))
+        #define __FLOAT128_CPP_SPELLING_ENABLED__
+    #endif /* NVHPC 21.1 or later on x86_64/amd64 */
+#elif defined(__clang__)
+    #if (defined(__x86_64__) || defined(__amd64__)) && \
+        ((__clang_major__ > 3) || \
+            (__clang_major__ == 3 && __clang_minor__ >= 9))
+        #define __FLOAT128_CPP_SPELLING_ENABLED__
+    #endif /* Clang 3.9 or later on x86_64/amd64 */
+#elif defined(__GNUC__)
+    // reached only when neither NVHPC nor Clang matched above: check the GCC version
+    #if (defined(__x86_64__) || defined(__amd64__)) && \
+        ((__GNUC__ > 4) || \
+            (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
+        #define __FLOAT128_CPP_SPELLING_ENABLED__
+    #endif /* GCC 4.1 or later on x86_64/amd64 */
+#endif /* (defined __NVCOMPILER_MAJOR__) */
+
+#endif /* !__CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+* Support for _Float128 on:                                                    *
+*    - GCC version 13.1 or later on x86_64/amd64/aarch64                       *
+*                                                                              *
+*******************************************************************************/
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__NVCOMPILER_MAJOR__)
+    // Clang and NVHPC are excluded by the guard above; require GCC 13.1 or later
+    #if (defined(__x86_64__) || defined(__amd64__) || defined(__aarch64__)) && \
+        ((__GNUC__ > 13) || \
+            (__GNUC__ == 13 && __GNUC_MINOR__ >= 1))
+        #define __FLOAT128_C_SPELLING_ENABLED__
+    #endif
+#endif /* defined(__GNUC__) && !defined(__clang__) && !defined(__NVCOMPILER_MAJOR__) */
+
+/**
+ * \defgroup CUDA_MATH_QUAD FP128 Quad Precision Mathematical Functions
+ * This section describes quad precision mathematical functions.
+ * To use these functions, include the header file \p device_fp128_functions.h in your program.
+ * 
+ * Functions declared here have \p __nv_fp128_ prefix to distinguish them
+ * from other global namespace symbols.
+ *
+ * Note that FP128 CUDA Math functions are only available to device programs
+ * on platforms where host compiler supports the basic quad precision datatype
+ * \p __float128 or \p _Float128.
+ * 
+ * Every FP128 CUDA Math function name is overloaded to support either of these
+ * host-compiler-specific types, whenever the types are available. See for example:
+ * \code
+ * #ifdef __FLOAT128_CPP_SPELLING_ENABLED__
+ *     __float128 __nv_fp128_sqrt(__float128 x);
+ * #endif
+ * #ifdef __FLOAT128_C_SPELLING_ENABLED__
+ *     _Float128 __nv_fp128_sqrt(_Float128 x);
+ * #endif
+ * \endcode
+ *
+ * \note_fp128_target_arch
+ */
+
+#ifdef __FLOAT128_CPP_SPELLING_ENABLED__
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sqrt{x} \end_cuda_math_formula, the square root of the input argument.
+ *
+ * \return 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __nv_fp128_sqrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sqrt(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_sqrt(\p x) returns NaN if \p x is less than 0.
+ * - __nv_fp128_sqrt(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sqrt(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sin{x} \end_cuda_math_formula, the sine of input argument (measured in radians).
+ * 
+ * \return 
+ * \cuda_math_formula \sin{x} \end_cuda_math_formula.
+ * - __nv_fp128_sin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sin(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_sin(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sin(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cos{x} \end_cuda_math_formula, the cosine of input argument (measured in radians).
+ * 
+ * \return 
+ * \cuda_math_formula \cos{x} \end_cuda_math_formula.
+ * - __nv_fp128_cos(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula 1 \end_cuda_math_formula.
+ * - __nv_fp128_cos(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_cos(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_cos(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tan{x} \end_cuda_math_formula, the tangent of input argument (measured in radians).
+ * 
+ * \return 
+ * \cuda_math_formula \tan{x} \end_cuda_math_formula.
+ * - __nv_fp128_tan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_tan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_tan(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_tan(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sin^{-1}{x} \end_cuda_math_formula, the arc sine of input argument.
+ * 
+ * \return 
+ * The principal value of the arc sine of the input argument \p x.
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - __nv_fp128_asin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_asin(\p x) returns NaN for \p x outside [-1, +1].
+ * - __nv_fp128_asin(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_asin(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cos^{-1}{x} \end_cuda_math_formula, the arc cosine of input argument.
+ *
+ * \return 
+ * The principal value of the arc cosine of the input argument \p x.
+ * Result will be in radians, in the interval [0, 
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - __nv_fp128_acos(1) returns +0.
+ * - __nv_fp128_acos(\p x) returns NaN for \p x outside [-1, +1].
+ * - __nv_fp128_acos(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_acos(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tan^{-1}{x} \end_cuda_math_formula, the arc tangent of input argument.
+ *
+ * \return 
+ * The principal value of the arc tangent of the input argument \p x.
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ].
+ * - __nv_fp128_atan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_atan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2.
+ * - __nv_fp128_atan(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_atan(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula e^x \end_cuda_math_formula, the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * \return
+ * - __nv_fp128_exp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_exp(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_exp(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_exp(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_exp(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula 2^x \end_cuda_math_formula, the base 2 exponential of the input argument.
+ *
+ * \return
+ * - __nv_fp128_exp2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_exp2(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_exp2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_exp2(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_exp2(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula 10^x \end_cuda_math_formula, the base 10 exponential of the input argument.
+ *
+ * \return
+ * - __nv_fp128_exp10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_exp10(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_exp10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_exp10(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_exp10(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate 
+ * \cuda_math_formula e^x - 1 \end_cuda_math_formula,
+ * the base e exponential of the input argument, minus 1.
+ *
+ * \return
+ * - __nv_fp128_expm1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_expm1(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns -1.
+ * - __nv_fp128_expm1(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_expm1(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_expm1(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \log_{e}{x} \end_cuda_math_formula, the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument.
+ *
+ * \return
+ * - __nv_fp128_log(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log(1) returns +0.
+ * - __nv_fp128_log(\p x) returns NaN for \p x < 0.
+ * - __nv_fp128_log(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \log_{2}{x} \end_cuda_math_formula, the base 2 logarithm of the input argument.
+ *
+ * \return 
+ * - __nv_fp128_log2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log2(1) returns +0.
+ * - __nv_fp128_log2(\p x) returns NaN for \p x < 0.
+ * - __nv_fp128_log2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log2(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log2(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \log_{10}{x} \end_cuda_math_formula, the base 10 logarithm of the input argument.
+ *
+ * \return 
+ * - __nv_fp128_log10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log10(1) returns +0.
+ * - __nv_fp128_log10(\p x) returns NaN for \p x < 0.
+ * - __nv_fp128_log10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log10(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log10(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula.
+ *
+ * \return
+ * - __nv_fp128_log1p(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_log1p(-1) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_log1p(\p x) returns NaN for \p x < -1.
+ * - __nv_fp128_log1p(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_log1p(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_log1p(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the value of \cuda_math_formula x^{y} \end_cuda_math_formula, first argument to the power of second argument.
+ *
+ * \return 
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  for \p y an odd integer less than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y less than 0 and not an odd integer.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - __nv_fp128_pow(-1, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_pow(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1 for any \p x, even a NaN.
+ * - __nv_fp128_pow(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0 and not an odd integer.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0.
+ * - __nv_fp128_pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0.
+ * - __nv_fp128_pow(\p x, \p y) returns NaN if either \p x or \p y or both are NaN and \p x \cuda_math_formula \neq \end_cuda_math_formula +1 and \p y \cuda_math_formula \neq\pm 0 \end_cuda_math_formula.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_pow(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sinh{x} \end_cuda_math_formula, the hyperbolic sine of the input argument.
+ *
+ * Calculate \cuda_math_formula \sinh{x} \end_cuda_math_formula, the hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - __nv_fp128_sinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_sinh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sinh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cosh{x} \end_cuda_math_formula, the hyperbolic cosine of the input argument.
+ *
+ * \return
+ * - __nv_fp128_cosh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - __nv_fp128_cosh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_cosh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_cosh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tanh{x} \end_cuda_math_formula, the hyperbolic tangent of the input argument.
+ *
+ * \return
+ * - __nv_fp128_tanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_tanh( 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - __nv_fp128_tanh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_tanh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sinh^{-1}{x} \end_cuda_math_formula, the inverse hyperbolic sine of the input argument.
+ *
+ * \return
+ * - __nv_fp128_asinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_asinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula. 
+ * - __nv_fp128_asinh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_asinh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \cosh^{-1}{x} \end_cuda_math_formula, the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ].
+ * - __nv_fp128_acosh(1) returns 0.
+ * - __nv_fp128_acosh(\p x) returns NaN for \p x in the interval [
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , 1).
+ * - __nv_fp128_acosh( 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_acosh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_acosh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \tanh^{-1}{x} \end_cuda_math_formula, the inverse hyperbolic tangent of the input argument.
+ *
+ * \return 
+ * - __nv_fp128_atanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_atanh(
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_atanh(\p x) returns NaN for \p x outside interval [-1, 1].
+ * - __nv_fp128_atanh(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_atanh(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Truncate input argument to the integral part.
+ *
+ * \return 
+ * Rounded \p x to the nearest integer value in floating-point format, that does not exceed \p x in 
+ * magnitude.
+ * - __nv_fp128_trunc(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_trunc(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_trunc(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_trunc(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula, the largest integer less than or equal to \p x.
+ * 
+ * \return
+ * \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - __nv_fp128_floor(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_floor(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_floor(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_floor(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \lceil x \rceil \end_cuda_math_formula, the smallest integer greater than or equal to \p x.
+ * 
+ * \return
+ * \cuda_math_formula \lceil x \rceil \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - __nv_fp128_ceil(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_ceil(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_ceil(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_ceil(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Round to nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return 
+ * - __nv_fp128_round(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_round(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_round(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_round(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Round to nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * - __nv_fp128_rint(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_rint(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_rint(NaN) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_rint(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula |x| \end_cuda_math_formula, the absolute value of the input argument.
+ *
+ * \return
+ * - __nv_fp128_fabs(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_fabs(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns +0.
+ * - __nv_fp128_fabs(NaN) returns an unspecified NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fabs(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Create a value with the magnitude of the first argument \p x, and the sign of the second argument \p y.
+ *
+ * \return
+ * - __nv_fp128_copysign(\p NaN, \p y) returns a \p NaN with the sign of \p y.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_copysign(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * \return
+ * The maximum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fmax(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * \return
+ * The minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fmin(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * \return
+ * - __nv_fp128_fdim(\p x, \p y) returns \p x - \p y if \cuda_math_formula x > y \end_cuda_math_formula.
+ * - __nv_fp128_fdim(\p x, \p y) returns +0 if \cuda_math_formula x \leq y \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fdim(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the floating-point remainder of \p x / \p y.
+ *
+ * \return
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * - The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ * - __nv_fp128_fmod(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  if \p y is not zero.
+ * - __nv_fp128_fmod(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x if \p x is finite.
+ * - __nv_fp128_fmod(\p x, \p y) returns NaN if \p x is 
+ * \cuda_math_formula \pm\infty \end_cuda_math_formula
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fmod(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute the floating-point remainder function.
+ *
+ * \return 
+ * The floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y is defined as 
+ * \cuda_math_formula r = x - n y \end_cuda_math_formula.
+ * The value \p n is the integer value nearest 
+ * \cuda_math_formula \frac{x}{y} \end_cuda_math_formula. 
+ * In the halfway cases when 
+ * \cuda_math_formula | n -\frac{x}{y} | = \frac{1}{2} \end_cuda_math_formula
+ * , the
+ * even \p n value is chosen.
+ * - __nv_fp128_remainder(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns NaN.
+ * - __nv_fp128_remainder(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y) returns NaN.
+ * - __nv_fp128_remainder(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x for finite \p x.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_remainder(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Extract mantissa and exponent of the floating-point input argument.
+ * 
+ * Decompose the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and an integral term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to 0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \cuda_math_formula x = m\cdot 2^n \end_cuda_math_formula.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * The fractional component \p m.
+ * - __nv_fp128_frexp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores zero in the location pointed to by \p nptr.
+ * - __nv_fp128_frexp(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - __nv_fp128_frexp(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_frexp(__float128 x, int* nptr) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The 
+ * integral part is stored in floating-point format in the location to which \p iptr points.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - __nv_fp128_modf(
+ * \cuda_math_formula \pm x \end_cuda_math_formula
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - __nv_fp128_modf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p iptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *   in the object pointed to by \p iptr.
+ * - __nv_fp128_modf(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_modf(__float128 x, __float128* iptr) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate \cuda_math_formula \sqrt{x^2+y^2} \end_cuda_math_formula, the square root of the sum of squares of two arguments.
+ *
+ * \return
+ * The length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \cuda_math_formula |x| \end_cuda_math_formula and \cuda_math_formula |y| \end_cuda_math_formula without undue overflow or underflow.
+ * - __nv_fp128_hypot(\p x,\p y), __nv_fp128_hypot(\p y,\p x), and __nv_fp128_hypot(\p x, \p -y) are equivalent.
+ * - __nv_fp128_hypot(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) is equivalent to __nv_fp128_fabs(\p x).
+ * - __nv_fp128_hypot(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula,
+ * even if \p y is a NaN.
+ * - __nv_fp128_hypot(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_hypot(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * as a single operation using round-to-nearest-even rounding mode.
+ *
+ * \return
+ * The value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * as a single ternary operation, rounded once using round-to-nearest,
+ * ties-to-even rounding mode.
+ * - __nv_fp128_fma(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __nv_fp128_fma(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __nv_fp128_fma(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_fma(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_fma(__float128 x, __float128 y, __float128 c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula.
+ *
+ * \return
+ * - __nv_fp128_ldexp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p exp) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_ldexp(\p x, 0) returns \p x.
+ * - __nv_fp128_ldexp(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p exp) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - __nv_fp128_ldexp(NaN, \p exp) returns NaN.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_ldexp(__float128 x, int exp) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute the unbiased integer exponent of the input argument.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - __nv_fp128_ilogb(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns <tt>INT_MIN</tt>.
+ * - __nv_fp128_ilogb(NaN) returns <tt>INT_MIN</tt>.
+ * - __nv_fp128_ilogb(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_ilogb(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula x \cdot y \end_cuda_math_formula, the product of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __nv_fp128_mul(\p x, \p y) is equivalent to __nv_fp128_mul(\p y, \p x).
+ * - __nv_fp128_mul(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __nv_fp128_mul(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_mul(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_mul(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula x + y \end_cuda_math_formula, the sum of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __nv_fp128_add(\p x, \p y) is equivalent to __nv_fp128_add(\p y, \p x).
+ * - __nv_fp128_add(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __nv_fp128_add(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __nv_fp128_add(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_add(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_add(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_add(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula x - y \end_cuda_math_formula, the difference of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __nv_fp128_sub(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __nv_fp128_sub(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __nv_fp128_sub(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_sub(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __nv_fp128_sub(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __nv_fp128_sub(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_sub(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Compute \cuda_math_formula \frac{x}{y} \end_cuda_math_formula, the quotient of the two floating-point inputs using round-to-nearest-even rounding mode.
+ *
+ * \return
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __nv_fp128_div(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_div(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __nv_fp128_div(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __nv_fp128_div(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __nv_fp128_div(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __nv_fp128_div(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_quad
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ __float128 __nv_fp128_div(__float128 x, __float128 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine whether the input argument is a NaN.
+ *
+ * \return
+ * A nonzero value if and only if \p x is a NaN value.
+ *
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isnan(__float128 x) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_QUAD
+ * \brief Determine whether the pair of inputs is unordered.
+ *
+ * \return
+ * - nonzero value if at least one of input values is a NaN.
+ * - zero otherwise
+ *
+ * \note_fp128_target_arch
+ */
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isunordered(__float128 x, __float128 y) __DEF_IF_HOST
+#endif /* __FLOAT128_CPP_SPELLING_ENABLED__ */
+
+
+#ifdef __FLOAT128_C_SPELLING_ENABLED__
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sqrt(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sin(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_cos(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_tan(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_asin(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_acos(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_atan(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_exp(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_exp2(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_exp10(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_expm1(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log2(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log10(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_log1p(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_pow(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sinh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_cosh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_tanh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_asinh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_acosh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_atanh(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_trunc(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_floor(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_ceil(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_round(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_rint(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fabs(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_copysign(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fmax(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fmin(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fdim(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fmod(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_remainder(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_frexp(_Float128 x, int* nptr) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_modf(_Float128 x, _Float128* iptr) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_hypot(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_fma(_Float128 x, _Float128 y, _Float128 c) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_ldexp(_Float128 x, int exp) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_ilogb(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_mul(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_add(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_sub(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ _Float128 __nv_fp128_div(_Float128 x, _Float128 y) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isnan(_Float128 x) __DEF_IF_HOST
+__DEVICE_FP128_FUNCTIONS_DECL__ int __nv_fp128_isunordered(_Float128 x, _Float128 y) __DEF_IF_HOST
+#endif /* __FLOAT128_C_SPELLING_ENABLED__ */
+
+
+#undef __DEVICE_FP128_FUNCTIONS_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__DEVICE_FP128_FUNCTIONS_H__ */
+
+#endif /* !__NV_DISABLE_DEVICE_FP128_FUNCTIONS__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae9de40d680c6e50c25b0c4a01c00679bd0c8fe4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/device_functions.h
@@ -0,0 +1,2993 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/device_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/device_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__
+#endif
+
+#if !defined(__DEVICE_FUNCTIONS_H__)
+#define __DEVICE_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ __device__ __host__ __cudart_builtin__
+#else
+#define __DEVICE_FUNCTIONS_DECL__ __device__ __cudart_builtin__
+#define __DEVICE_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __cudart_builtin__
+#define __DEVICE_HOST_FUNCTIONS_STATIC_DECL__ static __inline__ __device__ __host__ __cudart_builtin__
+#endif /* __CUDACC_RTC__ */
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+extern "C"
+{
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 32 bits of the product of the two 32-bit integers.
+ *
+ * Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
+ * are 32-bit integers.
+ *
+ * \return Returns the most significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __mulhi(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 32 bits of the product of the two 32-bit unsigned integers.
+ *
+ * Calculate the most significant 32 bits of the 64-bit product \p x * \p y, where \p x and \p y
+ * are 32-bit unsigned integers. 
+ *
+ * \return Returns the most significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __umulhi(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 64 bits of the product of the two 64-bit integers.
+ *
+ * Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
+ * are 64-bit integers. 
+ *
+ * \return Returns the most significant 64 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __mul64hi(long long int x, long long int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the most significant 64 bits of the product of the two 64-bit unsigned integers.
+ *
+ * Calculate the most significant 64 bits of the 128-bit product \p x * \p y, where \p x and \p y
+ * are 64-bit unsigned integers. 
+ *
+ * \return Returns the most significant 64 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in an integer as a float.
+ *
+ * Reinterpret the bits in the signed integer value \p x as a single-precision
+ * floating-point value.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int_as_float(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a float as a signed integer.
+ *
+ * Reinterpret the bits in the single-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float_as_int(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in an unsigned integer as a float.
+ *
+ * Reinterpret the bits in the unsigned integer value \p x as a single-precision
+ * floating-point value.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint_as_float(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a float as an unsigned integer.
+ *
+ * Reinterpret the bits in the single-precision floating-point value \p x
+ * as an unsigned integer.
+ * \return Returns reinterpreted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float_as_uint(float x);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __syncthreads(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __prof_trigger(int);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __threadfence(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __threadfence_block(void);
+__DEVICE_FUNCTIONS_DECL__ 
+#if defined(__GNUC__) || defined(__CUDACC_RTC__)
+__attribute__((__noreturn__))
+#elif defined(_MSC_VER)
+__declspec(noreturn)
+#endif  /* defined(__GNUC__) || defined(__CUDACC_RTC__) */
+__device_builtin__ void                   __trap(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   __brkpt();
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Clamp the input argument to [+0.0, 1.0].
+ *
+ * Clamp the input argument \p x to be within the interval [+0.0, 1.0].
+ * \return 
+ * - __saturatef(\p x) returns +0 if \cuda_math_formula x \le 0 \end_cuda_math_formula.
+ * - __saturatef(\p x) returns 1 if \cuda_math_formula x \ge 1 \end_cuda_math_formula.
+ * - __saturatef(\p x) returns \p x if \cuda_math_formula 0 < x < 1 \end_cuda_math_formula.
+ * - __saturatef(NaN) returns +0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __saturatef(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the sum of absolute difference.
+ *
+ * Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the 32-bit sum of the third argument \p z plus the absolute 
+ * value of the difference between the first argument, \p x, and second 
+ * argument, \p y.
+ * 
+ * Inputs \p x and \p y are signed 32-bit integers, input \p z is 
+ * a 32-bit unsigned integer.
+ *
+ * \return Returns 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __sad(int x, int y, unsigned int z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the sum of absolute difference.
+ *
+ * Calculate 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula
+ * , the 32-bit sum of the third argument \p z plus the absolute 
+ * value of the difference between the first argument, \p x, and second 
+ * argument, \p y.
+ * 
+ * Inputs \p x, \p y, and \p z are unsigned 32-bit integers.
+ * 
+ * \return Returns 
+ * \cuda_math_formula |x - y| + z \end_cuda_math_formula.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __usad(unsigned int x, unsigned int y, unsigned int z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the least significant 32 bits of the product of the least significant 24 bits of two integers.
+ *
+ * Calculate the least significant 32 bits of the product of the least significant 24 bits of \p x and \p y.
+ * The high order 8 bits of \p x and \p y are ignored.
+ *
+ * \return Returns the least significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __mul24(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Calculate the least significant 32 bits of the product of the least significant 24 bits of two unsigned integers.
+ *
+ * Calculate the least significant 32 bits of the product of the least significant 24 bits of \p x and \p y.
+ * The high order 8 bits of  \p x and  \p y are ignored. 
+ *
+ * \return Returns the least significant 32 bits of the product \p x * \p y.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __umul24(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Divide two floating-point values.
+ *
+ * Compute \p x divided by \p y.
+ *
+ * \return Returns \p x / \p y.
+ * - Follows the regular division operation behavior by default.
+ * - If \p -use_fast_math is specified and is not amended by
+ * an explicit \p -prec-div=true, uses ::__fdividef() for higher
+ * performance.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fdividef(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate division of the input arguments.
+ *
+ * Calculate the fast approximate division of \p x by \p y.
+ *
+ * \return Returns \p x / \p y.
+ * - __fdividef(
+ * \cuda_math_formula \infty \end_cuda_math_formula
+ * , \p y) returns NaN for 
+ * \cuda_math_formula 2^{126} < |y| < 2^{128} \end_cuda_math_formula.
+ * - __fdividef(\p x, \p y) returns 0 for 
+ * \cuda_math_formula 2^{126} < |y| < 2^{128} \end_cuda_math_formula
+ *  and finite
+ * \cuda_math_formula x \end_cuda_math_formula.
+ * \see __fdiv_rn() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdividef(float x, float y);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fdivide(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate sine of the input argument.
+ *
+ * Calculate the fast approximate sine of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate sine of \p x.
+ *
+ * \see sinf() for further special case behavior specification.
+ * \note_accuracy_single_intrinsic
+ * \note Output in the denormal range is flushed to sign preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __sinf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate cosine of the input argument.
+ *
+ * Calculate the fast approximate cosine of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate cosine of \p x.
+ *
+ * \see cosf() for further special case behavior specification.
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __cosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate tangent of the input argument.
+ *
+ * Calculate the fast approximate tangent of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate tangent of \p x.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note The result is computed as the fast divide of ::__sinf()
+ * by ::__cosf(). Denormal output is flushed to sign-preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __tanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate hyperbolic tangent of the input argument.
+ *
+ * Calculate the fast approximate hyperbolic tangent of the input argument \p x, measured in radians.
+ *
+ * \return Returns the approximate hyperbolic tangent of \p x.
+ *
+ * \see tanhf() for further special case behavior specification.
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __tanhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate sine and cosine of the first input argument.
+ *
+ * Calculate the fast approximate sine and cosine of the first input argument \p x (measured
+ * in radians). The results for sine and cosine are written into the second 
+ * argument, \p sptr, and, respectively, third argument, \p cptr.
+ *
+ * \see __sinf() and __cosf().
+ * \note_accuracy_single_intrinsic
+ * \note Denorm input/output is flushed to sign preserving 0.0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ void                   __sincosf(float x, float *sptr, float *cptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x, 
+ * \cuda_math_formula e^x \end_cuda_math_formula.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula e^x \end_cuda_math_formula.
+ * \see expf() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __expf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 10 exponential of the input argument.
+ *
+ * Calculate the fast approximate base 10 exponential of the input argument \p x, 
+ * \cuda_math_formula 10^x \end_cuda_math_formula.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula 10^x \end_cuda_math_formula.
+ * \see exp10f() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __exp10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 2 logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 2 logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula \log_2(x) \end_cuda_math_formula.
+ * \see log2f() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __log2f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 10 logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 10 logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula \log_{10}(x) \end_cuda_math_formula.
+ * \see log10f() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __log10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument.
+ *
+ * Calculate the fast approximate base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument \p x.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula \log_e(x) \end_cuda_math_formula.
+ * \see logf() for further special case behavior specification.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __logf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Calculate the fast approximation of 
+ * \cuda_math_formula x^y \end_cuda_math_formula.
+ *
+ * Calculate the fast approximation of \p x, the first input argument, 
+ * raised to the power of \p y, the second input argument, 
+ * \cuda_math_formula x^y \end_cuda_math_formula.
+ *
+ * \return Returns an approximation to 
+ * \cuda_math_formula x^y \end_cuda_math_formula.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ float                  __powf(float x, float y) __THROW;
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __float2int_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __float2uint_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rn(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-towards-zero mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rz(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-up mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_ru(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed integer to a float in round-down mode.
+ *
+ * Convert the signed integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __int2float_rd(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rn(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-towards-zero mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rz(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-up mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_ru(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned integer to a float in round-down mode.
+ *
+ * Convert the unsigned integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __uint2float_rd(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to a signed 64-bit integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to a signed 64-bit integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __float2ll_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-to-nearest-even mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-towards-zero mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-up mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a float to an unsigned 64-bit integer in round-down mode.
+ *
+ * Convert the single-precision floating-point value \p x to an unsigned 64-bit integer
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __float2ull_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rn(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-towards-zero mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rz(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-up mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_ru(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit integer to a float in round-down mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ll2float_rd(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-to-nearest-even mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rn(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-towards-zero mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rz(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-up mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_ru(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit integer to a float in round-down mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a single-precision floating-point value
+ * in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __ull2float_rd(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the sum of \p x and \p y in round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_rn(\p x, \p y) is equivalent to __fadd_rn(\p y, \p x).
+ * - __fadd_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_rn(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the sum of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_rz(\p x, \p y) is equivalent to __fadd_rz(\p y, \p x).
+ * - __fadd_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_rz(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-up mode.
+ * 
+ * Compute the sum of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_ru(\p x, \p y) is equivalent to __fadd_ru(\p y, \p x).
+ * - __fadd_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_ru(\p x, \p -x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Add two floating-point values in round-down mode.
+ * 
+ * Compute the sum of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ * - __fadd_rd(\p x, \p y) is equivalent to __fadd_rd(\p y, \p x).
+ * - __fadd_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p x.
+ * - __fadd_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fadd_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns NaN.
+ * - __fadd_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fadd_rd(\p x, \p -x) returns \cuda_math_formula -0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fadd_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the difference of \p x and \p y in round-to-nearest-even rounding mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_rn(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the difference of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_rz(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-up mode.
+ * 
+ * Compute the difference of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_ru(\p x, \p x) returns \cuda_math_formula +0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Subtract two floating-point values in round-down mode.
+ * 
+ * Compute the difference of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ * - __fsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \pm\infty \end_cuda_math_formula for finite \p y.
+ * - __fsub_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \mp\infty \end_cuda_math_formula for finite \p x.
+ * - __fsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fsub_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \mp\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __fsub_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsub_rd(\p x, \p x) returns \cuda_math_formula -0 \end_cuda_math_formula for finite \p x, including \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsub_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
+ * 
+ * Compute the product of \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_rn(\p x, \p y) is equivalent to __fmul_rn(\p y, \p x).
+ * - __fmul_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rn(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-towards-zero mode.
+ * 
+ * Compute the product of \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_rz(\p x, \p y) is equivalent to __fmul_rz(\p y, \p x).
+ * - __fmul_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-up mode.
+ * 
+ * Compute the product of \p x and \p y in round-up (to positive infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_ru(\p x, \p y) is equivalent to __fmul_ru(\p y, \p x).
+ * - __fmul_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Multiply two floating-point values in round-down mode.
+ * 
+ * Compute the product of \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ * - sign of the product \p x * \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fmul_rd(\p x, \p y) is equivalent to __fmul_rd(\p y, \p x).
+ * - __fmul_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fmul_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fmul_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ * \note_nofma
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmul_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-to-nearest-even mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-to-nearest-even mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_rn(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rn(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rn(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rn(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rn(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-towards-zero mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-towards-zero mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_rz(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rz(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rz(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rz(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rz(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-up mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-up (to positive infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_ru(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_ru(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_ru(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_ru(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_ru(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation, in round-down mode.
+ * 
+ * Computes the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation, rounding the
+ * result once in round-down (to negative infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - __fmaf_rd(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rd(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - __fmaf_rd(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula -0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fmaf_rd(\p x, \p y, \p z) returns \cuda_math_formula -0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fmaf_rd(float x, float y, float z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_rn(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-towards-zero mode.
+ * 
+ * Compute the reciprocal of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_rz(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-up mode.
+ * 
+ * Compute the reciprocal of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_ru(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula
+ *  in round-down mode.
+ * 
+ * Compute the reciprocal of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \frac{1}{x} \end_cuda_math_formula.
+ * - __frcp_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frcp_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __frcp_rd(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frcp_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_rn(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_rn(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_rn(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-towards-zero mode.
+ * 
+ * Compute the square root of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_rz(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_rz(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_rz(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rz(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-up mode.
+ * 
+ * Compute the square root of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_ru(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_ru(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_ru(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_ru(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula
+ *  in round-down mode.
+ * 
+ * Compute the square root of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - __fsqrt_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - __fsqrt_rd(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - __fsqrt_rd(\p x) returns NaN for \p x < 0.
+ * - __fsqrt_rd(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fsqrt_rd(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ * - __frsqrt_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - __frsqrt_rn(\cuda_math_formula +\infty \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula.
+ * - __frsqrt_rn(\p x) returns NaN for \p x < 0.
+ * - __frsqrt_rn(NaN) returns NaN.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __frsqrt_rn(float x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-to-nearest-even mode.
+ *
+ * Divide the floating-point value \p x by \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rn(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_rn(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_rn(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_rn(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rn(float x, float y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-towards-zero mode.
+ *
+ * Divide the floating-point value \p x by \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rz(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_rz(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_rz(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_rz(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rz(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-up mode.
+ * 
+ * Divide the floating-point value \p x by \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_ru(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_ru(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_ru(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_ru(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_ru(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Divide two floating-point values in round-down mode.
+ *
+ * Divide the floating-point value \p x by \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x / \p y.
+ * - sign of the quotient \p x / \p y is XOR of the signs of \p x and \p y when neither inputs nor result are NaN.
+ * - __fdiv_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns NaN.
+ * - __fdiv_rd(\p x, \cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for finite \p x.
+ * - __fdiv_rd(\cuda_math_formula \pm\infty \end_cuda_math_formula, \p y) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for finite \p y.
+ * - __fdiv_rd(\p x, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \infty \end_cuda_math_formula of appropriate sign for \p x \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - __fdiv_rd(\cuda_math_formula \pm 0 \end_cuda_math_formula, \p y) returns \cuda_math_formula 0 \end_cuda_math_formula of appropriate sign for \p y \cuda_math_formula \neq 0 \end_cuda_math_formula.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single_intrinsic
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  __fdiv_rd(float x, float y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Return the number of consecutive high-order zero bits in a 32-bit integer.
+ *
+ * Count the number of consecutive leading zero bits, starting at the most significant bit (bit 31) of \p x.
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the number of leading zero bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __clz(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the least significant bit set to 1 in a 32-bit integer.
+ *
+ * Find the position of the first (least significant) bit set to 1 in \p x, where the least significant
+ * bit position is 1. 
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the position of the first bit set.
+ * - __ffs(0) returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __ffs(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of bits that are set to 1 in a 32-bit integer.
+ *
+ * Count the number of bits that are set to 1 in \p x.
+ *
+ * \return Returns a value between 0 and 32 inclusive representing the number of set bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __popc(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the bit order of a 32-bit unsigned integer.
+ *
+ * Reverses the bit order of the 32-bit unsigned integer \p x.
+ *
+ * \return Returns the bit-reversed value of \p x, i.e. bit N of the return value corresponds to bit 31-N of \p x.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __brev(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of consecutive high-order zero bits in a 64-bit integer.
+ *
+ * Count the number of consecutive leading zero bits, starting at the most significant bit (bit 63) of \p x.
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the number of leading zero bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __clzll(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the least significant bit set to 1 in a 64-bit integer.
+ *
+ * Find the position of the first (least significant) bit set to 1 in \p x, where the least significant
+ * bit position is 1. 
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the position of the first bit set.
+ * - __ffsll(0) returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __ffsll(long long int x);
+
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Count the number of bits that are set to 1 in a 64-bit integer.
+ *
+ * Count the number of bits that are set to 1 in \p x.
+ *
+ * \return Returns a value between 0 and 64 inclusive representing the number of set bits.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __popcll(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Reverse the bit order of a 64-bit unsigned integer.
+ *
+ * Reverses the bit order of the 64-bit unsigned integer \p x.
+ *
+ * \return Returns the bit-reversed value of \p x, i.e. bit N of the return value corresponds to bit 63-N of \p x.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __brevll(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Return selected bytes from two 32-bit unsigned integers.
+ *
+ * \return Returns a 32-bit integer consisting of four bytes from eight input bytes provided in the two
+ * input integers \p x and \p y, as specified by a selector, \p s.
+ *
+ * Create 8-byte source
+ * - uint64_t \p tmp64 = ((uint64_t)\p y << 32) | \p x;
+ *
+ * Extract selector bits
+ * - \p selector0 = (\p s >>  0) & 0x7;
+ * - \p selector1 = (\p s >>  4) & 0x7;
+ * - \p selector2 = (\p s >>  8) & 0x7;
+ * - \p selector3 = (\p s >> 12) & 0x7;
+ *
+ * Return 4 selected bytes from 8-byte source:
+ * - \p res[07:00] = \p tmp64[\p selector0];
+ * - \p res[15:08] = \p tmp64[\p selector1];
+ * - \p res[23:16] = \p tmp64[\p selector2];
+ * - \p res[31:24] = \p tmp64[\p selector3];
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __byte_perm(unsigned int x, unsigned int y, unsigned int s);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute average of signed input arguments, avoiding overflow
+ * in the intermediate sum.
+ *
+ * Compute average of signed input arguments \p x and \p y 
+ * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum.
+ *
+ * \return Returns a signed integer value representing the signed 
+ * average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __hadd(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute rounded average of signed input arguments, avoiding
+ * overflow in the intermediate sum.
+ *
+ * Compute average of signed input arguments \p x and \p y 
+ * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate
+ * sum.
+ *
+ * \return Returns a signed integer value representing the signed 
+ * rounded average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __rhadd(int x, int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute average of unsigned input arguments, avoiding overflow
+ * in the intermediate sum.
+ *
+ * Compute average of unsigned input arguments \p x and \p y 
+ * as ( \p x + \p y ) >> 1, avoiding overflow in the intermediate sum.
+ *
+ * \return Returns an unsigned integer value representing the unsigned 
+ * average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __uhadd(unsigned int x, unsigned int y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Compute rounded average of unsigned input arguments, avoiding
+ * overflow in the intermediate sum.
+ *
+ * Compute average of unsigned input arguments \p x and \p y 
+ * as ( \p x + \p y + 1 ) >> 1, avoiding overflow in the intermediate
+ * sum.
+ *
+ * \return Returns an unsigned integer value representing the unsigned 
+ * rounded average value of the two inputs.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __urhadd(unsigned int x, unsigned int y);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __double2int_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __double2uint_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          __double2ll_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-towards-zero mode.
+ * \return Returns converted value.
+ * \note_fp_to_int_out_of_range_undefined
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int __double2ull_rz(double x);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm0(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm1(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm2(void);
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           __pm3(void);
+
+/*******************************************************************************
+ *                                                                             *
+ *                        FP16 SIMD functions                                  *
+ *                                                                             *
+ *******************************************************************************/
+
+ //  #include "fp16.h"
+
+
+/*******************************************************************************
+ *                                                                             *
+ *                                SIMD functions                               *
+ *                                                                             *
+ *******************************************************************************/
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute value: |a|.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes,
+ * then computes absolute value for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute value with signed saturation: |a|.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes,
+ * then computes absolute value with signed saturation for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed addition, with wrap-around: a + b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs unsigned addition on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword addition with signed saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then performs addition with signed saturation on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword addition with unsigned saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes, then performs addition
+ * with unsigned saturation on corresponding parts. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed rounded average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes signed rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned rounded average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes unsigned rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned average computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes,
+ * then computes unsigned average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: a == b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if they are equal, and 0000 otherwise.
+ * For example __vcmpeq2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a >= b ? 0xffff : 0.
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise.
+ * For example __vcmpges2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a >= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part >= 'b' part, and 0000 otherwise.
+ * For example __vcmpgeu2(0x1234aba5, 0x1234aba6) returns 0xffff0000.
+ * \return Returns 0xffff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a > b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise.
+ * For example __vcmpgts2(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xffff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a > b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part > 'b' part, and 0000 otherwise.
+ * For example __vcmpgtu2(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xffff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a <= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise.
+ * For example __vcmples2(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xffff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a <= b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part <= 'b' part, and 0000 otherwise.
+ * For example __vcmpleu2(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xffff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: a < b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise.
+ * For example __vcmplts2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: a < b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part < 'b' part, and 0000 otherwise.
+ * For example __vcmpltu2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: a != b ? 0xffff : 0.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts result is ffff if 'a' part != 'b' part, and 0000 otherwise.
+ * For example __vcmpne2(0x1234aba5, 0x1234aba6) returns 0x0000ffff.
+ * \return Returns 0xffff if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute difference of unsigned integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute difference. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed maximum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes signed maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned maximum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes unsigned maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed minimum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes signed minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned minimum computation.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes unsigned minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: returns 1 if both parts compare equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part == 'b' part.
+ * If both equalities are satisfied, function returns 1.
+ * \return Returns 1 if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare greater than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare greater than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword signed comparison: returns 1 if both parts compare less than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword unsigned comparison: returns 1 if both parts compare less than.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed comparison: returns 1 if both parts compare not equal.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs comparison 'a' part != 'b' part.
+ * If both conditions are satisfied, function returns 1.
+ * \return Returns 1 if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword sum of abs diff of unsigned.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute differences and returns
+ * sum of those differences.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed subtraction, with wrap-around: a - b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword (un)signed subtraction, with signed saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction with signed saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword subtraction with unsigned saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function performs subtraction with unsigned saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus2 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword negation.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes.
+ * For each part function computes negation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword negation with signed saturation.
+ *
+ * Splits 4 bytes of argument into 2 parts, each consisting of 2 bytes.
+ * For each part function computes negation with signed saturation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss2(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-halfword absolute difference of signed integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute difference.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword sum of absolute difference of signed.
+ *
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * For corresponding parts function computes absolute differences and sums them up.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads2(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute value: |a|.
+ *
+ * Splits argument by bytes. Computes absolute value of each byte.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabs4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute value with signed saturation: |a|.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte,
+ * then computes absolute value with signed saturation for each of parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsss4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed addition: a + b.
+ *
+ * Splits 'a' into 4 bytes, then performs unsigned addition on each of these
+ * bytes with the corresponding byte from 'b', ignoring overflow.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vadd4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte addition with signed saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then performs addition with signed saturation on corresponding parts.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddss4 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte addition with unsigned saturation: a + b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte, then performs addition
+ * with unsigned saturation on corresponding parts. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vaddus4 (unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed rounded average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes signed rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned rounded average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes unsigned rounded average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vavgu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned average.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte,
+ * then computes unsigned average of corresponding parts. Partial results are
+ * recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vhaddu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: a == b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if they are equal, and 00 otherwise.
+ * For example __vcmpeq4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpeq4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a >= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise.
+ * For example __vcmpges4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpges4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a >= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part >= 'b' part, and 00 otherwise.
+ * For example __vcmpgeu4(0x1234aba5, 0x1234aba6) returns 0xffffff00.
+ * \return Returns 0xff if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgeu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a > b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise.
+ * For example __vcmpgts4(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a > b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part > 'b' part, and 00 otherwise.
+ * For example __vcmpgtu4(0x1234aba5, 0x1234aba6) returns 0x00000000.
+ * \return Returns 0xff if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpgtu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a <= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise.
+ * For example __vcmples4(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmples4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a <= b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part <= 'b' part, and 00 otherwise.
+ * For example __vcmpleu4(0x1234aba5, 0x1234aba6) returns 0xffffffff.
+ * \return Returns 0xff if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpleu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: a < b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise.
+ * For example __vcmplts4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmplts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: a < b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part < 'b' part, and 00 otherwise.
+ * For example __vcmpltu4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpltu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: a != b ? 0xff : 0.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts result is ff if 'a' part != 'b' part, and 00 otherwise.
+ * For example __vcmpne4(0x1234aba5, 0x1234aba6) returns 0x000000ff.
+ * \return Returns 0xff if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vcmpne4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute difference of unsigned integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute difference. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed maximum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes signed maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned maximum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes unsigned maximum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmaxu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte signed minimum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes signed minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vmins4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte unsigned minimum.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes unsigned minimum. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vminu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: returns 1 if all 4 pairs compare equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part == 'b' part.
+ * If both equalities are satisfied, function returns 1.
+ * \return Returns 1 if a = b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vseteq4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetles4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare less than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part <= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a <= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetleu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare less than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetlts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare less than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part < 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a < b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetltu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetges4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare greater than or equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part >= 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a >= b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgeu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte signed comparison: returns 1 if all 4 pairs compare greater than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgts4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte unsigned comparison: returns 1 if all 4 pairs compare greater than.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part > 'b' part.
+ * If both inequalities are satisfied, function returns 1.
+ * \return Returns 1 if a > b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetgtu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte (un)signed comparison: returns 1 if all 4 pairs compare not equal.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs comparison 'a' part != 'b' part.
+ * If both conditions are satisfied, function returns 1.
+ * \return Returns 1 if a != b, else returns 0.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsetne4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte sum of abs difference of unsigned.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute differences and returns
+ * sum of those differences.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsadu4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction: a - b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction. Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsub4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction with signed saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction with signed saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubss4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte subtraction with unsigned saturation: a - b.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function performs subtraction with unsigned saturation.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsubus4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte negation.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte.
+ * For each part function computes negation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vneg4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-byte negation with signed saturation.
+ *
+ * Splits 4 bytes of argument into 4 parts, each consisting of 1 byte.
+ * For each part function computes negation with signed saturation. Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vnegss4(unsigned int a);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte absolute difference of signed integer: |a - b|.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute difference.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vabsdiffs4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes per-byte sum of abs difference of signed.
+ *
+ * Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte.
+ * For corresponding parts function computes absolute differences and sums them up.
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int __vsads4(unsigned int a, unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), 0)
+ *
+ * Calculates the maximum of \p a and \p b of two signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__ int __vimax_s32_relu(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a max with relu ( = max(a_part, b_part, 0) ). Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax_s16x2_relu(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(a, b), 0)
+ *
+ * Calculates the minimum of \p a and \p b of two signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin_s32_relu(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(a, b), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a min with relu ( = max(min(a_part, b_part), 0) ). Partial results
+ * are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin_s16x2_relu(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), c)
+ * 
+ * Calculates the 3-way max of signed integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimax3_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a 3-way max ( = max(max(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a, b), c)
+ * 
+ * Calculates the 3-way max of unsigned integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a 3-way max ( = max(max(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(min(a, b), c)
+ * 
+ * Calculates the 3-way min of signed integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin3_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(min(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a 3-way min ( = min(min(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(min(a, b), c)
+ * 
+ * Calculates the 3-way min of unsigned integers \p a, \p b and \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(min(a, b), c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a 3-way min ( = min(min(a_part, b_part), c_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(max(a, b), c), 0)
+ *
+ * Calculates the maximum of three signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimax3_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(max(a, b), c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a three-way max with relu ( = max(a_part, b_part, c_part, 0) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimax3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(min(a, b), c), 0)
+ *
+ * Calculates the minimum of three signed ints, if this is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vimin3_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(min(a, b), c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a three-way min with relu ( = max(min(a_part, b_part, c_part), 0) ).
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vimin3_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a + b, c)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the max with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmax_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add and compare: max(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a + b, c)
+ *
+ * Calculates the sum of unsigned integers \p a and \p b and takes the max with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs an add and compare: max(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a + b, c)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the min with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmin_s32(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add and compare: min(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_s16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a + b, c)
+ *
+ * Calculates the sum of unsigned integers \p a and \p b and takes the min with \p c.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_u32(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a + b, c)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs an add and compare: min(a_part + b_part, c_part)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_u16x2(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(max(a + b, c), 0)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the max with \p c.
+ * If the result is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmax_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(max(a + b, c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add, followed by a max with relu: max(max(a_part + b_part, c_part), 0)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmax_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(min(a + b, c), 0)
+ *
+ * Calculates the sum of signed integers \p a and \p b and takes the min with \p c.
+ * If the result is less than \p 0 then \p 0 is returned.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __viaddmin_s32_relu(const int a, const int b, const int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(min(a + b, c), 0)
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs an add, followed by a min with relu: max(min(a_part + b_part, c_part), 0)
+ * Partial results are recombined and returned as unsigned int.
+ * \return Returns computed value.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __viaddmin_s16x2_relu(const unsigned int a, const unsigned int b, const unsigned int c);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a, b), also sets the value pointed to by pred to (a >= b).
+ *
+ * Calculates the maximum of \p a and \p b of two signed ints. Also sets the value pointed to by \p pred to the value (a >= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vibmax_s32(const int a, const int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes max(a, b), also sets the value pointed to by pred to (a >= b).
+ *
+ * Calculates the maximum of \p a and \p b of two unsigned ints. Also sets the value pointed to by \p pred to the value (a >= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_u32(const unsigned int a, const unsigned int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a, b), also sets the value pointed to by pred to (a <= b).
+ *
+ * Calculates the minimum of \p a and \p b of two signed ints. Also sets the value pointed to by \p pred to the value (a <= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  int __vibmin_s32(const int a, const int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Computes min(a, b), also sets the value pointed to by pred to (a <= b).
+ *
+ * Calculates the minimum of \p a and \p b of two unsigned ints. Also sets the value pointed to by \p pred to the value (a <= b).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_u32(const unsigned int a, const unsigned int b, bool* const pred);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a >= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a maximum ( = max(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part >= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part >= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword max(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a >= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a maximum ( = max(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part >= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part >= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmax_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a <= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as signed shorts.
+ * For corresponding parts function performs a minimum ( = min(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part <= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part <= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_s16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SIMD
+ * \brief Performs per-halfword min(a, b), also sets the value pointed to by pred_hi and pred_lo to the per-halfword result of (a <= b).
+ * 
+ * Splits 4 bytes of each argument into 2 parts, each consisting of 2 bytes.
+ * These 2 byte parts are interpreted as unsigned shorts.
+ * For corresponding parts function performs a minimum ( = min(a_part, b_part) ).
+ * Partial results are recombined and returned as unsigned int.
+ * Sets the value pointed to by \p pred_hi to the value (a_high_part <= b_high_part).
+ * Sets the value pointed to by \p pred_lo to the value (a_low_part <= b_low_part).
+ * \return Returns computed values.
+ */
+__DEVICE_HOST_FUNCTIONS_STATIC_DECL__  unsigned int __vibmin_u16x2(const unsigned int a, const unsigned int b, bool* const pred_hi, bool* const pred_lo);
+
+/*******************************************************************************
+ *                                                                             *
+ *                            END SIMD functions                               *
+ *                                                                             *
+ *******************************************************************************/
+} // extern "C"
+#undef EXCLUDE_FROM_RTC
+
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __DEVICE_FUNCTIONS_STATIC_DECL__
+#undef __DEVICE_HOST_FUNCTIONS_STATIC_DECL__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#include "device_functions.hpp"
+#endif /* !defined(__CUDACC_RTC__) */
+
+#include "device_atomic_functions.h"
+#include "device_double_functions.h"
+#include "sm_20_atomic_functions.h"
+#include "sm_32_atomic_functions.h"
+#include "sm_35_atomic_functions.h"
+#include "sm_60_atomic_functions.h"
+#include "sm_20_intrinsics.h"
+#include "sm_30_intrinsics.h"
+#include "sm_32_intrinsics.h"
+#include "sm_35_intrinsics.h"
+#include "sm_61_intrinsics.h"
+#include "sm_70_rt.h"
+#include "sm_80_rt.h"
+#include "sm_90_rt.h"
+#include "sm_100_rt.h"
+#ifndef __CUDACC_RTC_MINIMAL__
+#include "texture_indirect_functions.h"
+#include "surface_indirect_functions.h"
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#include "cudacc_ext.h"
+
+#ifdef __CUDACC__
+extern "C" __host__ __device__  unsigned CUDARTAPI __cudaPushCallConfiguration(dim3 gridDim,
+                                      dim3 blockDim, 
+                                      size_t sharedMem = 0, 
+                                      struct CUstream_st *stream = 0);
+
+#if !defined(__CUDACC_RTC__) &&!defined(__NV_LEGACY_LAUNCH)
+extern "C" cudaError_t CUDARTAPI __cudaGetKernel(cudaKernel_t *, const void *);
+
+extern "C"  cudaError_t CUDARTAPI __cudaLaunchKernel(
+        cudaKernel_t kernel,
+        dim3 gridDim,
+        dim3 blockDim,
+        void **args,
+        size_t sharedMem,
+        cudaStream_t stream
+);
+
+extern "C" cudaError_t CUDARTAPI __cudaLaunchKernel_ptsz(
+        cudaKernel_t kernel,
+        dim3 gridDim,
+        dim3 blockDim,
+        void **args,
+        size_t sharedMem,
+        cudaStream_t stream
+);
+
+// Referenced from compiler-generated kernel launch code. Forwards all six arguments unchanged to __cudaLaunchKernel_ptsz when __CUDART_API_PER_THREAD_DEFAULT_STREAM is defined, otherwise to __cudaLaunchKernel, and returns that call's cudaError_t.
+static inline cudaError_t __cudaLaunchKernel_helper(
+                                  cudaKernel_t kernel,
+                                  dim3 gridDim,
+                                  dim3 blockDim,
+                                  void **args,
+                                  size_t sharedMem,
+                                  cudaStream_t stream)
+{
+#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+  return __cudaLaunchKernel_ptsz(kernel, gridDim, blockDim, args, sharedMem,
+                                 stream);
+#else  /* !__CUDART_API_PER_THREAD_DEFAULT_STREAM */
+  return __cudaLaunchKernel(kernel, gridDim, blockDim, args, sharedMem,
+                            stream);
+#endif  /* __CUDART_API_PER_THREAD_DEFAULT_STREAM */
+}
+#endif  /* !defined(__CUDACC_RTC__) && !defined(__NV_LEGACY_LAUNCH) */
+
+enum {  /* unnamed enum; enumerators take the implicit values 0..5 in the order listed */
+  __NV_ATOMIC_RELAXED,  /* 0 */
+  __NV_ATOMIC_CONSUME,  /* 1 */
+  __NV_ATOMIC_ACQUIRE,  /* 2 */
+  __NV_ATOMIC_RELEASE,  /* 3 */
+  __NV_ATOMIC_ACQ_REL,  /* 4 */
+  __NV_ATOMIC_SEQ_CST   /* 5 */
+};
+/* NOTE(review): the names above parallel the C++ memory-order enumerators and the names below look like thread-scope selectors -- presumably both are passed to atomic built-ins; confirm against their consumers. */
+enum {  /* unnamed enum; enumerators take the implicit values 0..4 in the order listed */
+  __NV_THREAD_SCOPE_THREAD,   /* 0 */
+  __NV_THREAD_SCOPE_BLOCK,    /* 1 */
+  __NV_THREAD_SCOPE_CLUSTER,  /* 2 */
+  __NV_THREAD_SCOPE_DEVICE,   /* 3 */
+  __NV_THREAD_SCOPE_SYSTEM    /* 4 */
+};
+
+#endif  /* __CUDACC__ */
+
+#endif /* !__DEVICE_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/func_macro.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/func_macro.h
new file mode 100644
index 0000000000000000000000000000000000000000..633554a01aaabd1bca5ae278c276710f323d5d7b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/func_macro.h
@@ -0,0 +1,57 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/func_macro.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/func_macro.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
+#endif
+
+#if !defined(__FUNC_MACRO_H__)
+#define __FUNC_MACRO_H__
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+#error -- incorrect inclusion of a cudart header file
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+#if defined(__GNUC__)
+/* __GNUC__ hosts: __func__(decl) expands to "inline decl"; __device_func__(decl) expands to "static decl" tagged __attribute__((__unused__)). */
+#define __func__(decl) \
+        inline decl
+
+#define __device_func__(decl) \
+        static __attribute__((__unused__)) decl
+
+#elif defined(_WIN32)
+/* _WIN32 hosts: __func__(decl) expands to "static inline decl"; __device_func__(decl) expands to plain "static decl". */
+#define __func__(decl) \
+        static inline decl
+
+#define __device_func__(decl) \
+        static decl
+/* There is no #else branch: when neither __GNUC__ nor _WIN32 is defined, both macros stay undefined. */
+#endif /* __GNUC__ */
+
+#endif /* __FUNC_MACRO_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_FUNC_MACRO_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_config.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..820b81c2945d8dcc241329673a558090a4922e52
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_config.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/host_config.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_config.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
+#endif
+
+#if !defined(__HOST_CONFIG_H__)
+#define __HOST_CONFIG_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+
+#define _CRTIMP
+#define __THROW
+
+#else /* __CUDACC_RTC__ */
+
+/* check for host compilers that are compatible with nvcc */
+#if !defined(__GNUC__) && !defined(_WIN32)
+
+#error --- !!! UNSUPPORTED COMPILER !!! ---
+
+#endif /* !__GNUC__ && !_WIN32 */
+
+/* check invalid configurations */
+#if defined(__PGIC__)
+#if !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__)
+#error -- unsupported pgc++ configuration! pgc++ is supported only on Linux x86_64!
+#endif /* !defined(__GNUC__) || !defined(__LP64__) || !defined(__linux__) */
+#endif  /* defined(__PGIC__) */
+
+#if defined(__powerpc__)
+#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
+#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
+#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
+#endif /* __powerpc__ */
+
+#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
+#error -- clang and clang++ are the only supported host compilers on Mac OS X!
+#endif /* __APPLE__ && __MACH__ && !__clang__ */
+
+
+/* check host compiler version  */
+#if !__NV_NO_HOST_COMPILER_CHECK
+
+#if defined(__ICC)
+
+#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !defined(__GNUC__) || !defined(__LP64__)
+
+#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, ICC 17.0, ICC 18.0, ICC 19.x and 20.x on Linux x86_64 are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+ 
+#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700 && __ICC != 1800 && !(__ICC >= 1900 && __ICC <= 2021)) || !__GNUC__ || !__LP64__ */
+
+#endif /* __ICC */
+
+#if defined(__GRCO_CLANG_COMPILER__)
+#if (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 19))
+#error -- unsupported Grace clang version! The version must be 16.x to 19.x. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif  /* (__GRCO_CLANG_COMPILER__ == 1) && ((__clang_major__ < 16) || (__clang_major__ > 19)) */
+
+#endif /* __GRCO_CLANG_COMPILER__  */
+
+#if defined(__INTEL_CLANG_COMPILER)
+#error -- unsupported Intel ICX compiler! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif /* __INTEL_CLANG_COMPILER */
+
+#if defined(__powerpc__)
+
+#if defined(__ibmxl_vrm__) && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) && \
+                              !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000)
+
+#error -- unsupported xlC version! only xlC 13.1 and 16.1 are supported. The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif /* __ibmxl_vrm__ && !(__ibmxl_vrm__ >= 0x0d010000 && __ibmxl_vrm__ < 0x0d020000) &&
+                           !(__ibmxl_vrm__ >= 0x10010000 && __ibmxl_vrm__ < 0x10020000) */
+
+#endif /* __powerpc__ */
+
+#if defined(__GNUC__)
+
+#if __GNUC__ > 14
+
+#error -- unsupported GNU version! gcc versions later than 14 are not supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif /* __GNUC__ > 14 */
+
+
+#if defined(__HORIZON__)
+#if (__clang_major__ >= 20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3))
+#error -- unsupported HOS clang version! The version must be must be less than 20 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+#endif  /* (__clang_major__ >= 20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3)) */
+#endif /* __HORIZON__  */
+
+#if defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__)
+
+#if (__clang_major__ >= 20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3))
+#error -- unsupported clang version! clang version must be less than 20 and greater than 3.2 . The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#endif  /* (__clang_major__ >=  20) || (__clang_major__ < 3) || ((__clang_major__ == 3) &&  (__clang_minor__ < 3)) */
+
+#endif /* defined(__clang__) && !defined(__ibmxl_vrm__) && !defined(__ICC) && !defined(__HORIZON__) && !defined(__APPLE__) && !defined(__GRCO_CLANG_COMPILER__) */
+
+
+#endif /* __GNUC__ */
+
+#if defined(_WIN32)
+/* Host compiler gate for Windows: only _MSC_VER values in [1910, 1950) pass; anything else is a hard #error. */
+#if _MSC_VER < 1910 || _MSC_VER >= 1950
+
+#error -- unsupported Microsoft Visual Studio version! Only the versions between 2017 and 2022 (inclusive) are supported! The nvcc flag '-allow-unsupported-compiler' can be used to override this version check; however, using an unsupported host compiler may cause compilation failure or incorrect run time execution. Use at your own risk.
+
+#elif _MSC_VER >= 1910 && _MSC_VER < 1910
+/* NOTE: the range above is empty (a value cannot be both >= 1910 and < 1910), so the deprecation message below is never emitted. */
+#pragma message("support for this version of Microsoft Visual Studio has been deprecated! Only the versions between 2017 and 2022 (inclusive) are supported!")
+
+#endif /* (_MSC_VER < 1910 || _MSC_VER >= 1950) || (_MSC_VER >= 1910 && _MSC_VER < 1910) */
+
+#endif /* _WIN32 */
+#endif  /* !__NV_NO_HOST_COMPILER_CHECK */
+
+
+/* configure host compiler */
+#if defined(__APPLE__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#if defined(__BLOCKS__) /* nvcc does not support closures */
+
+#undef __BLOCKS__
+
+#endif /* __BLOCKS__ */
+
+#elif defined(__ANDROID__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__QNX__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__HORIZON__)
+
+#define _CRTIMP
+#define _ACRTIMP
+#define __THROW
+
+#elif defined(__GNUC__)
+
+#define _CRTIMP
+#define _ACRTIMP
+
+#include <features.h> /* for __THROW */
+
+#elif defined(_WIN32)
+
+#if _MSC_VER >= 1500
+
+#undef _USE_DECLSPECS_FOR_SAL
+#define _USE_DECLSPECS_FOR_SAL \
+        1
+
+#endif /* _MSC_VER >= 1500 */
+
+#if !defined(_CRT_NONSTDC_NO_WARNINGS)
+
+#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_NONSTDC_NO_WARNINGS */
+
+#if !defined(_CRT_SECURE_NO_WARNINGS)
+
+#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
+
+#endif /* !_CRT_SECURE_NO_WARNINGS */
+
+#if !defined(NOMINMAX)
+
+#define NOMINMAX /* min and max are part of cuda runtime */
+
+#endif /* !NOMINMAX */
+
+#include <crtdefs.h> /* for _CRTIMP */
+#if _MSC_VER >= 1900
+#include <corecrt.h> /* for _ACRTIMP */
+#endif /* _MSC_VER >= 1900 */
+
+#define __THROW
+
+#endif /* __APPLE__ */
+
+#endif /* __CUDACC_RTC__ */
+
+
+#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
+
+#if __CUDACC_RTC__
+typedef char *va_list;
+#else /* !__CUDACC_RTC__ */
+#include <cstdarg>
+#endif /* __CUDACC_RTC__ */
+
+
+#undef va_start
+#undef va_end
+#undef va_arg
+
+#ifdef __PGIC__
+
+#undef __builtin_va_end
+
+#define va_start(v,l) __builtin_alt_va_start(v,l)
+#define va_end(v) __builtin_va_end(v)
+#define va_arg(v,l) __builtin_alt_va_arg(v,l)
+
+#if (__cplusplus >= 201103L)
+#undef va_copy
+#define va_copy(d,s)  __builtin_va_copy(d,s)
+#endif
+
+#else /* !__PGIC__ */
+
+
+#define va_start(ap, x) (__cu_va_start(&ap, x))
+#define va_end(ap) (__cu_va_end(&ap))
+#define va_arg(ap, t)  (*((t *)__cu_va_arg(&ap, (t *)0)))
+
+#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
+#undef va_copy
+#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
+#endif /* (_MSC_VER >= 1800)  || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
+#endif /* __PGIC__ */
+
+#endif /* defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
+
+
+
+#endif /* __CUDACC__ */
+
+#endif /* !__HOST_CONFIG_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_runtime.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..22e3a1bea875ddb2a15075f6e0ecb10b7ce1a6a7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/host_runtime.h
@@ -0,0 +1,306 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* flag not set by an including header: diagnose direct use, then set the flag for the rest of this file */
+#if defined(_MSC_VER)
+#pragma message("crt/host_runtime.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/host_runtime.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__ /* records that this header set the flag; the block at the end of the file undefines both macros again */
+#endif
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+#define __CUDA_INTERNAL_COMPILATION__
+#define __text__
+#define __surf__
+#define __name__shadow_var(c, cpp) \
+        #c
+#define __name__text_var(c, cpp) \
+        #cpp
+#define __host__shadow_var(c, cpp) \
+        cpp
+#define __text_var(c, cpp) \
+        cpp
+#define __device_fun(fun) \
+        #fun
+#define __device_var(var) \
+        #var
+#define __device__text_var(c, cpp) \
+        #c
+#define __device__shadow_var(c, cpp) \
+        #c
+
+#if defined(_WIN32) && !defined(_WIN64)
+
+#define __pad__(f) \
+        f
+
+#else /* _WIN32 && !_WIN64 */
+
+#define __pad__(f)
+
+#endif /* _WIN32 && !_WIN64 */
+
+#include "builtin_types.h"
+#include "storage_class.h"
+
+#else /* !__CUDA_INTERNAL_COMPILATION__ */
+/* Returns a T* to val. The address is taken on a char reference obtained by casting val, so the built-in & is applied rather than any operator& overloaded for T; the casts also strip const/volatile before converting back to T*. */
+template <typename T>
+static inline T *__cudaAddressOf(T &val) 
+{
+    return (T *)((void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(val)))));
+}
+
+#define __cudaRegisterBinary(X)                                                   \
+        __cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
+        { void (*callback_fp)(void **) =  (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
+        atexit(__cudaUnregisterBinaryUtil)
+        
+#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
+        __cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
+#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
+        __cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
+
+#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
+        __cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
+#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
+        __cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
+#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
+        __cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
+
+extern "C" cudaError_t CUDARTAPI __cudaPopCallConfiguration(
+  dim3         *gridDim,
+  dim3         *blockDim,
+  size_t       *sharedMem,
+  void         *stream
+);
+
+#define __cudaLaunchPrologue(size) \
+        void * __args_arr[size]; \
+        int __args_idx = 0
+        
+#define __cudaSetupArg(arg, offset) \
+        __args_arr[__args_idx] = (void *)__cudaAddressOf(arg); ++__args_idx
+          
+#define __cudaSetupArgSimple(arg, offset) \
+        __args_arr[__args_idx] = (void *)(char *)&arg; ++__args_idx
+        
+#if defined(__GNUC__)
+#define __NV_ATTR_UNUSED_FOR_LAUNCH __attribute__((unused))
+#else  /* !__GNUC__ */
+#define __NV_ATTR_UNUSED_FOR_LAUNCH
+#endif  /* __GNUC__ */
+
+#ifdef __NV_LEGACY_LAUNCH
+/* the use of __args_idx in the expression below avoids a host compiler warning about it being an
+   unused variable when the launch has no arguments; both branches pass the address of __args_arr[0] */
+#define __cudaLaunch(fun) \
+        { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH;  __f = fun; \
+          dim3 __gridDim, __blockDim;\
+          size_t __sharedMem; \
+          cudaStream_t __stream; \
+          if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
+            return; \
+          if (__args_idx == 0) {\
+            (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
+          } else { \
+            (void)cudaLaunchKernel(fun, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
+          }\
+        }
+#else  /* !__NV_LEGACY_LAUNCH */
+#define __cudaLaunch(fun) \
+        { volatile static char *__f __NV_ATTR_UNUSED_FOR_LAUNCH;  __f = fun; \
+          static cudaKernel_t __handle = 0; \
+          volatile static bool __tmp __NV_ATTR_UNUSED_FOR_LAUNCH = (__cudaGetKernel(&__handle, (const void *)fun) == cudaSuccess); \
+          dim3 __gridDim, __blockDim;\
+          size_t __sharedMem; \
+          cudaStream_t __stream; \
+          if (__cudaPopCallConfiguration(&__gridDim, &__blockDim, &__sharedMem, &__stream) != cudaSuccess) \
+            return; \
+          if (__args_idx == 0) {\
+            (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[__args_idx], __sharedMem, __stream);\
+          } else { \
+            (void)__cudaLaunchKernel_helper(__handle, __gridDim, __blockDim, &__args_arr[0], __sharedMem, __stream);\
+          }\
+        }
+#endif  /* __NV_LEGACY_LAUNCH */
+
+#if defined(__GNUC__)
+#define __nv_dummy_param_ref(param) \
+        { volatile static void **__ref __attribute__((unused)); __ref = (volatile void **)param; }
+#else /* __GNUC__ */
+#define __nv_dummy_param_ref(param) \
+        { volatile static void **__ref; __ref = (volatile void **)param; }
+#endif /* __GNUC__ */
+/* Function form of the __nv_dummy_param_ref macro: the macro expansion is the whole body, which stores param into a function-local volatile static pointer and does nothing else. */
+static void ____nv_dummy_param_ref(void *param) __nv_dummy_param_ref(param)
+
+#define __REGISTERFUNCNAME_CORE(X) __cudaRegisterLinkedBinary##X
+#define __REGISTERFUNCNAME(X) __REGISTERFUNCNAME_CORE(X)
+
+extern "C" {
+void __REGISTERFUNCNAME( __NV_MODULE_ID ) ( void (*)(void **), void *, void *, void (*)(void *));
+}
+
+#define __TO_STRING_CORE(X) #X
+#define __TO_STRING(X) __TO_STRING_CORE(X)
+
+extern "C" {
+#if defined(_WIN32)
+#pragma data_seg("__nv_module_id")
+  static const __declspec(allocate("__nv_module_id")) unsigned char __module_id_str[] = __TO_STRING(__NV_MODULE_ID);
+#pragma data_seg()
+#elif defined(__APPLE__)
+  static const unsigned char __module_id_str[] __attribute__((section ("__NV_CUDA,__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
+#else
+  static const unsigned char __module_id_str[] __attribute__((section ("__nv_module_id"))) = __TO_STRING(__NV_MODULE_ID);
+#endif
+
+#undef __FATIDNAME_CORE
+#undef __FATIDNAME
+#define __FATIDNAME_CORE(X) __fatbinwrap##X
+#define __FATIDNAME(X) __FATIDNAME_CORE(X)
+
+#define  ____cudaRegisterLinkedBinary(X) \
+{ __REGISTERFUNCNAME(__NV_MODULE_ID) (( void (*)(void **))(X), (void *)&__FATIDNAME(__NV_MODULE_ID), (void *)&__module_id_str, (void (*)(void *))&____nv_dummy_param_ref); }
+
+}
+
+extern "C" {
+extern void** CUDARTAPI __cudaRegisterFatBinary(
+  void *fatCubin
+);
+
+extern void CUDARTAPI __cudaRegisterFatBinaryEnd(
+  void **fatCubinHandle
+);
+
+extern void CUDARTAPI __cudaUnregisterFatBinary(
+  void **fatCubinHandle
+);
+
+extern void CUDARTAPI __cudaRegisterVar(
+        void **fatCubinHandle,
+        char  *hostVar,
+        char  *deviceAddress,
+  const char  *deviceName,
+        int    ext,
+        size_t size,
+        int    constant,
+        int    global
+);
+
+extern void CUDARTAPI __cudaRegisterManagedVar(
+        void **fatCubinHandle,
+        void **hostVarPtrAddress,
+        char  *deviceAddress,
+  const char  *deviceName,
+        int    ext,
+        size_t size,
+        int    constant,
+        int    global
+);
+
+extern char CUDARTAPI __cudaInitModule(
+        void **fatCubinHandle
+);
+
+extern void CUDARTAPI __cudaRegisterTexture(
+        void                    **fatCubinHandle,
+  const struct textureReference  *hostVar,
+  const void                    **deviceAddress,
+  const char                     *deviceName,
+        int                       dim,       
+        int                       norm,      
+        int                        ext        
+);
+
+extern void CUDARTAPI __cudaRegisterSurface(
+        void                    **fatCubinHandle,
+  const struct surfaceReference  *hostVar,
+  const void                    **deviceAddress,
+  const char                     *deviceName,
+        int                       dim,       
+        int                       ext        
+);
+
+extern void CUDARTAPI __cudaRegisterFunction(
+        void   **fatCubinHandle,
+  const char    *hostFun,
+        char    *deviceFun,
+  const char    *deviceName,
+        int      thread_limit,
+        uint3   *tid,
+        uint3   *bid,
+        dim3    *bDim,
+        dim3    *gDim,
+        int     *wSize
+);
+
+#if defined(__APPLE__)
+extern "C" int atexit(void (*)(void));
+
+#elif  defined(__GNUC__) && !defined(__ANDROID__) && !defined(__HORIZON__)
+extern int atexit(void(*)(void)) throw();
+
+#elif defined(__HORIZON__)
+
+// __TEMP_WAR__ 200132570 HOS : Disable atexit call until it works
+#define atexit(p)
+
+#else /* __GNUC__ && !__ANDROID__ */
+extern int __cdecl atexit(void(__cdecl *)(void));
+#endif
+
+}
+
+static void **__cudaFatCubinHandle;
+/* Exit handler that the __cudaRegisterBinary macro passes to atexit(): references __cudaFatCubinHandle through ____nv_dummy_param_ref, then hands the handle to __cudaUnregisterFatBinary. */
+static void __cdecl __cudaUnregisterBinaryUtil(void)
+{
+  ____nv_dummy_param_ref((void *)&__cudaFatCubinHandle);
+  __cudaUnregisterFatBinary(__cudaFatCubinHandle);
+}
+/* Thin wrapper: forwards handle to __cudaInitModule and returns its char result unchanged. */
+static char __nv_init_managed_rt_with_module(void **handle)
+{
+  return __cudaInitModule(handle);
+}
+
+#include "common_functions.h"
+
+#pragma pack()
+
+#if defined(_WIN32)
+
+#pragma warning(disable: 4099)
+
+#if !defined(_WIN64)
+
+#pragma warning(disable: 4408)
+
+#endif /* !_WIN64 */
+
+#endif /* _WIN32 */
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_RUNTIME_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8201f97efb3aed940f62360d90899a5171eeb0d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.h
@@ -0,0 +1,6257 @@
+/*
+ * Copyright 1993-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/math_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/math_functions.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__
+#endif
+
+#if !defined(__MATH_FUNCTIONS_H__)
+#define __MATH_FUNCTIONS_H__
+
+#if defined(__QNX__) && (__GNUC__ >= 5) && defined(__CUDACC__)
+#if __has_include(<__config>)
+#include <__config>
+#endif
+#endif
+
+/**
+ * \defgroup CUDA_MATH Mathematical Functions
+ *
+ * CUDA mathematical functions are always available in device code.
+ *
+ * Host implementations of the common mathematical functions are mapped
+ * in a platform-specific way to standard math library functions, provided
+ * by the host compiler and respective host libm where available.
+ * Some functions, not available with the host compilers, are implemented
+ * in crt/math_functions.hpp header file.
+ * For example, see ::erfinv(). Other, less common functions,
+ * like ::rhypot(), ::cyl_bessel_i0() are only available in device code.
+ *
+ * CUDA Math device functions are no-throw for well-formed CUDA programs.
+ *
+ * Note that many floating-point and integer function names are
+ * overloaded for different argument types. For example, the ::log()
+ * function has the following prototypes:
+ * \code
+ * double log(double x);
+ * float log(float x);
+ * float logf(float x);
+ * \endcode
+ *
+ * Note also that due to implementation constraints, certain math functions
+ * from std:: namespace may be callable in device code even via explicitly
+ * qualified std:: names. However, such use is discouraged, since this
+ * capability is unsupported, unverified, undocumented, not portable, and
+ * may change without notice.
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+extern "C"
+{
+
+/**
+ * @{
+ */
+
+/* Define math function DOXYGEN toplevel groups, functions will
+   be added to these groups later.
+*/
+/**
+ * \defgroup CUDA_MATH_SINGLE Single Precision Mathematical Functions
+ * This section describes single precision mathematical functions.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_DOUBLE Double Precision Mathematical Functions
+ * This section describes double precision mathematical functions.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INT Integer Mathematical Functions
+ * This section describes integer mathematical functions.
+ * To use these functions, you do not need to include any additional
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_SINGLE Single Precision Intrinsics
+ * This section describes single precision intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_DOUBLE Double Precision Intrinsics
+ * This section describes double precision intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_INT Integer Intrinsics
+ * This section describes integer intrinsic functions. All of these
+ * functions are supported in device code. For some of the functions,
+ * host-specific implementations are also provided. For example, 
+ * see `::__nv_bswap16()`.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_CAST Type Casting Intrinsics
+ * This section describes type casting intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+/**
+ *
+ * \defgroup CUDA_MATH_INTRINSIC_SIMD SIMD Intrinsics
+ * This section describes SIMD intrinsic functions that are
+ * only supported in device code.
+ * To use these functions, you do not need to include any additional 
+ * header file in your program.
+ */
+
+
+/**
+ * @}
+ */
+#define __DEVICE_FUNCTIONS_DECL__ __host__ __device__
+#if !defined(_MSC_VER)
+#define __CUDA_MATH_CRTIMP
+#else
+#if _MSC_VER < 1900
+#define __CUDA_MATH_CRTIMP _CRTIMP
+#else
+#define __CUDA_MATH_CRTIMP _ACRTIMP
+#endif
+#endif
+
+#if defined(__ANDROID__) && (__ANDROID_API__ <= 20) && !defined(__aarch64__)
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ int                    abs(int);
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long int               labs(long int);
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long long int          llabs(long long int);
+#else /* __ANDROID__ */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - abs(\p INT_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ int            __cdecl abs(int a) __THROW;
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - labs(\p LONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long int       __cdecl labs(long int a) __THROW;
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - llabs(\p LLONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __cudart_builtin__ long long int          llabs(long long int a) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+}
+#endif
+#endif /* __ANDROID__ */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+/* put all math functions in std */
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the absolute value of the input argument.
+ *
+ * Calculate the absolute value of the input argument \p x.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - fabs(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fabs(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns +0.
+ * - fabs(NaN) returns an unspecified NaN.
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl fabs(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the absolute value of its argument
+ *
+ * Calculate the absolute value of the input argument \p x.
+ *
+ * \return
+ * Returns the absolute value of its argument.
+ * - fabsf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fabsf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns +0.
+ * - fabsf(NaN) returns an unspecified NaN.
+ *
+ * \note_accuracy_single
+ */
+#if defined(_WIN32) && defined(_M_ARM64)
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  __cdecl    fabsf(float x) __THROW;
+#else
+extern                    __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float             fabsf(float x) __THROW;
+#endif
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    min(const int a, const int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           umin(const unsigned int a, const unsigned int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llmin(const long long int a, const long long int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int ullmin(const unsigned long long int a, const unsigned long long int b);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * Determines the minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the minimum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fminf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fminf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Determine the minimum numeric value of the arguments.
+ *
+ * Determines the minimum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the minimum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fmin(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fmin(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    max(const int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned int           umax(const unsigned int a, const unsigned int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llmax(const long long int a, const long long int b);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ unsigned long long int ullmax(const unsigned long long int a, const unsigned long long int b);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * Determines the maximum numeric value of the arguments \p x and \p y. Treats NaN 
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the maximum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmaxf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fmaxf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Determine the maximum numeric value of the arguments.
+ *
+ * Determines the maximum numeric value of the arguments \p x and \p y. Treats NaN
+ * arguments as missing data. If one argument is a NaN and the other is a legitimate numeric
+ * value, the numeric value is chosen.
+ *
+ * \return
+ * Returns the maximum numeric value of the arguments \p x and \p y.
+ * - If both arguments are NaN, returns NaN.
+ * - If one argument is NaN, returns the numeric argument.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fmax(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fmax(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine of the input argument.
+ *
+ * Calculate the sine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - sin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sin(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sin(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sin(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cosine of the input argument.
+ *
+ * Calculate the cosine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - cos(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cos(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cos(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl cos(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine and cosine of the first input argument.
+ *
+ * Calculate the sine and cosine of the first input argument \p x (measured 
+ * in radians). The results for sine and cosine are written into the
+ * second argument, \p sptr, and, respectively, third argument, \p cptr.
+ *
+ * \see ::sin() and ::cos().
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincos(double x, double *sptr, double *cptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine and cosine of the first input argument.
+ *
+ * Calculate the sine and cosine of the first input argument \p x (measured
+ * in radians). The results for sine and cosine are written into the second 
+ * argument, \p sptr, and, respectively, third argument, \p cptr.
+ *
+ * \see ::sinf() and ::cosf().
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincosf(float x, float *sptr, float *cptr) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the tangent of the input argument.
+ *
+ * Calculate the tangent of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - tan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tan(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl tan(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the input argument.
+ *
+ * Calculate the nonnegative square root of \p x, 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - sqrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sqrt(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - sqrt(\p x) returns NaN if \p x is less than 0.
+ * - sqrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sqrt(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the reciprocal of the square root of the input argument.
+ *
+ * Calculate the reciprocal of the nonnegative square root of \p x, 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ * - rsqrt(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - rsqrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rsqrt(\p x) returns NaN if \p x is less than 0.
+ * - rsqrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rsqrt(double x);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the reciprocal of the square root of the input argument.
+ *
+ * Calculate the reciprocal of the nonnegative square root of \p x, 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula 1/\sqrt{x} \end_cuda_math_formula.
+ * - rsqrtf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - rsqrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rsqrtf(\p x) returns NaN if \p x is less than 0.
+ * - rsqrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rsqrtf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 2 logarithm of the input argument.
+ *
+ * Calculate the base 2 logarithm of the input argument \p x.
+ *
+ * \return 
+ * - log2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log2(1) returns +0.
+ * - log2(\p x) returns NaN for \p x < 0.
+ * - log2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log2(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 log2(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl log2(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 2 exponential of the input argument.
+ * 
+ * Calculate
+ * \cuda_math_formula 2^x \end_cuda_math_formula,
+ * the base 2 exponential of the input
+ * argument \p x.
+ *
+ * \return
+ * - exp2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp2(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp2(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp2(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 exp2(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl exp2(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 2 exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula 2^x \end_cuda_math_formula,
+ * the base 2 exponential of the input
+ * argument \p x.
+ *
+ * \return
+ * - exp2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp2f(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp2f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp2f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  exp2f(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl exp2f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 10 exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula 10^x \end_cuda_math_formula,
+ * the base 10 exponential of the input
+ * argument \p x.
+ *
+ * \return
+ * - exp10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp10(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp10(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */         
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 exp10(double x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 10 exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula 10^x \end_cuda_math_formula,
+ * the base 10 exponential of the input
+ * argument \p x.
+ *
+ * \return
+ * - exp10f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp10f(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp10f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp10f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  exp10f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument, minus 1.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * -1, the base
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x, minus 1.
+ *
+ * \return
+ * - expm1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - expm1(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns -1.
+ * - expm1(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - expm1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 expm1(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl expm1(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument, minus 1.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * -1, the base
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x, minus 1.
+ *
+ * \return
+ * - expm1f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - expm1f(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns -1.
+ * - expm1f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - expm1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  expm1f(float x) __THROW;        
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl expm1f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 2 logarithm of the input argument.
+ *
+ * Calculate the base 2 logarithm of the input argument \p x.
+ *
+ * \return
+ * - log2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log2f(1) returns +0.
+ * - log2f(\p x) returns NaN for \p x < 0.
+ * - log2f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log2f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log2f(float x) __THROW;         
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl log2f(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 10 logarithm of the input argument.
+ *
+ * Calculate the base 10 logarithm of the input argument \p x.
+ *
+ * \return
+ * - log10(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log10(1) returns +0.
+ * - log10(\p x) returns NaN for \p x < 0.
+ * - log10(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log10(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl log10(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument.
+ *
+ * Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  logarithm of the input argument \p x.
+ *
+ * \return
+ * - log(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log(1) returns +0.
+ * - log(\p x) returns NaN for \p x < 0.
+ * - log(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl log(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula
+ * of the input argument \p x.
+ *
+ * \return
+ * - log1p(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - log1p(-1) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log1p(\p x) returns NaN for \p x < -1.
+ * - log1p(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log1p(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 log1p(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl log1p(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula \log_{e}(1+x) \end_cuda_math_formula
+ * of the input argument \p x.
+ *
+ * \return
+ * - log1pf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - log1pf(-1) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log1pf(\p x) returns NaN for \p x < -1.
+ * - log1pf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log1pf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log1pf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl log1pf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the largest integer less than or equal to \p x.
+ * 
+ * Calculates the largest integer value which is less than or equal to \p x.
+ * 
+ * \return
+ * Returns 
+ * \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - floor(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - floor(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - floor(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl floor(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * ,
+ * the base
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x.
+ *
+ * \return
+ * - exp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - exp(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - exp(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - exp(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl exp(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic cosine of the input argument.
+ *
+ * Calculate the hyperbolic cosine of the input argument \p x.
+ *
+ * \return
+ * - cosh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cosh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - cosh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl cosh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic sine of the input argument.
+ *
+ * Calculate the hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - sinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - sinh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl sinh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the hyperbolic tangent of the input argument.
+ *
+ * Calculate the hyperbolic tangent of the input argument \p x.
+ *
+ * \return
+ * - tanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tanh( 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - tanh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl tanh(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * Calculate the nonnegative inverse hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ].
+ * - acosh(1) returns 0.
+ * - acosh(\p x) returns NaN for \p x in the interval [
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , 1).
+ * - acosh( 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - acosh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 acosh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl acosh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the nonnegative inverse hyperbolic cosine of the input argument.
+ *
+ * Calculate the nonnegative inverse hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in the interval [0, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ].
+ * - acoshf(1) returns 0.
+ * - acoshf(\p x) returns NaN for \p x in the interval [
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , 1).
+ * - acoshf( 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - acoshf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  acoshf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl acoshf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse hyperbolic sine of the input argument.
+ *
+ * Calculate the inverse hyperbolic sine of the input argument \p x.
+ *
+ * \return
+ * - asinh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - asinh(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula. 
+ * - asinh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 asinh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl asinh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse hyperbolic sine of the input argument.
+ *
+ * Calculate the inverse hyperbolic sine of the input argument \p x.
+ *
+ * \return 
+ * - asinhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula. 
+ * - asinhf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - asinhf(NaN) returns NaN.
+ * 
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  asinhf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl asinhf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse hyperbolic tangent of the input argument.
+ *
+ * Calculate the inverse hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - atanh(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atanh(
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - atanh(\p x) returns NaN for \p x outside interval [-1, 1].
+ * - atanh(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 atanh(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl atanh(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse hyperbolic tangent of the input argument.
+ *
+ * Calculate the inverse hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - atanhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atanhf(
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - atanhf(\p x) returns NaN for \p x outside interval [-1, 1].
+ * - atanhf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atanhf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl atanhf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula
+ *  of the input arguments \p x and \p exp.
+ *
+ * \return 
+ * - ldexp(\p x, \p exp) is equivalent to scalbn(\p x, \p exp).
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl ldexp(double x, int exp) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula.
+ *
+ * Calculate the value of 
+ * \cuda_math_formula x\cdot 2^{exp} \end_cuda_math_formula
+ *  of the input arguments \p x and \p exp.
+ *
+ * \return 
+ * - ldexpf(\p x, \p exp) is equivalent to scalbnf(\p x, \p exp).
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ldexpf(float x, int exp) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the floating-point representation of the exponent of the input argument.
+ *
+ * Calculate the floating-point representation of the exponent of the input argument \p x.
+ *
+ * \return 
+ * - logb(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - logb(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - logb(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 logb(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl logb(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the floating-point representation of the exponent of the input argument.
+ *
+ * Calculate the floating-point representation of the exponent of the input argument \p x.
+ *
+ * \return 
+ * - logbf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - logbf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - logbf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  logbf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl logbf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute the unbiased integer exponent of the argument.
+ *
+ * Calculates the unbiased integer exponent of the input argument \p x.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - ilogb(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns <tt>INT_MIN</tt>.
+ * - ilogb(NaN) returns <tt>INT_MIN</tt>.
+ * - ilogb(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    ilogb(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP int    __cdecl ilogb(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute the unbiased integer exponent of the argument.
+ *
+ * Calculates the unbiased integer exponent of the input argument \p x.
+ *
+ * \return
+ * - If successful, returns the unbiased exponent of the argument.
+ * - ilogbf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns <tt>INT_MIN</tt>.
+ * - ilogbf(NaN) returns <tt>INT_MIN</tt>.
+ * - ilogbf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns <tt>INT_MAX</tt>.
+ * - Note: above behavior does not take into account <tt>FP_ILOGB0</tt> nor <tt>FP_ILOGBNAN</tt>.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    ilogbf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP int    __cdecl ilogbf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalbn(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalbn(\p x, 0) returns \p x.
+ * - scalbn(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalbn(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 scalbn(double x, int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl scalbn(double x, int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalbnf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalbnf(\p x, 0) returns \p x.
+ * - scalbnf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalbnf(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  scalbnf(float x, int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl scalbnf(float x, int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalbln(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalbln(\p x, 0) returns \p x.
+ * - scalbln(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalbln(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 scalbln(double x, long int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl scalbln(double x, long int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Scale floating-point input by integer power of two.
+ *
+ * Scale \p x by 
+ * \cuda_math_formula 2^n \end_cuda_math_formula
+ *  by efficient manipulation of the floating-point
+ * exponent.
+ *
+ * \return 
+ * Returns \p x * 
+ * \cuda_math_formula 2^n \end_cuda_math_formula.
+ * - scalblnf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - scalblnf(\p x, 0) returns \p x.
+ * - scalblnf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p n) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - scalblnf(NaN, \p n) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  scalblnf(float x, long int n) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl scalblnf(float x, long int n);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Extract mantissa and exponent of a floating-point value
+ * 
+ * Decompose the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and another term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to  0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \cuda_math_formula x = m\cdot 2^n \end_cuda_math_formula.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * Returns the fractional component \p m.
+ * - frexp(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores zero in the location pointed to by \p nptr.
+ * - frexp(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - frexp(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl frexp(double x, int *nptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Extract mantissa and exponent of a floating-point value
+ * 
+ * Decomposes the floating-point value \p x into a component \p m for the 
+ * normalized fraction element and another term \p n for the exponent.
+ * The absolute value of \p m will be greater than or equal to  0.5 and 
+ * less than 1.0 or it will be equal to 0; 
+ * \cuda_math_formula x = m\cdot 2^n \end_cuda_math_formula.
+ * The integer exponent \p n will be stored in the location to which \p nptr points.
+ *
+ * \return
+ * Returns the fractional component \p m.
+ * - frexpf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores zero in the location pointed to by \p nptr.
+ * - frexpf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p nptr) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  and stores an unspecified value in the 
+ * location to which \p nptr points.
+ * - frexpf(NaN, \p nptr) returns a NaN and stores an unspecified value in the location to which \p nptr points.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  frexpf(float x, int *nptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - round(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - round(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - round(NaN) returns NaN.
+ *
+ * \note_slow_round See ::rint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 round(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl round(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded away from zero.
+ *
+ * \return
+ * Returns rounded integer value.
+ * - roundf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - roundf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - roundf(NaN) returns NaN.
+ *
+ * \note_slow_round See ::rintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  roundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl roundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::lrint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lround(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lround(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::lrintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lroundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lroundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::llrint().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llround(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llround(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, with halfway cases rounded 
+ * away from zero.  If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ *
+ * \note_slow_round See ::llrintf().
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llroundf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llroundf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - rint(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rint(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rint(NaN) returns NaN.
+ */
+#if defined(__CUDA_ARCH__) || defined(__DOXYGEN_ONLY__)
+/*
+ * We don't generate the declaration of rint for host compilation.
+ * This is actually a workaround to compile the boost header file when
+ * Clang 3.8 is used as the host compiler. The boost header file has
+ * the following example code:
+ *   namespace NS { extern "C" { double rint(double); }
+ *   }
+ *
+ * After preprocessing, we get something like below:
+ *
+ * extern "C" { double rint(double x) throw(); }
+ * # 30 "/usr/include/math.h" 3
+ * extern "C" { double rint(double x) throw(); }
+ * namespace NS { extern "C" { double rint(double); } }
+ *
+ * Although GCC accepts this output, Clang 3.8 doesn't.
+ * Furthermore, we cannot change the boost header file by adding "throw()"
+ * to rint's declaration there. So, as a workaround, we just don't generate
+ * our re-declaration for the host compilation.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl rint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#endif /* __CUDA_ARCH__ || __DOXYGEN_ONLY__ */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value in floating-point.
+ *
+ * Round \p x to the nearest integer value in floating-point format,
+ * with halfway cases rounded to the nearest even integer value.
+ *
+ * \return 
+ * Returns rounded integer value.
+ * - rintf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rintf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rintf(NaN) returns NaN.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl rintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lrint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lrint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long int               lrintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long int __cdecl lrintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llrint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llrint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round input to nearest integer value.
+ *
+ * Round \p x to the nearest integer value, 
+ * with halfway cases rounded to the nearest even integer value.
+ * If the result is outside the range of the return type,
+ * the behavior is undefined.
+ *
+ * \return 
+ * Returns rounded integer value.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ long long int          llrintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP long long int __cdecl llrintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Round the input argument to the nearest integer.
+ *
+ * Round argument \p x to an integer value in double precision floating-point format. Uses round to nearest rounding, with ties rounding to even.
+ *
+ * \return 
+ * - nearbyint(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - nearbyint(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - nearbyint(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nearbyint(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nearbyint(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Round the input argument to the nearest integer.
+ *
+ * Round argument \p x to an integer value in single precision floating-point format. Uses round to nearest rounding, with ties rounding to even.
+ *
+ * \return 
+ * - nearbyintf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - nearbyintf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - nearbyintf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nearbyintf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nearbyintf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate ceiling of the input argument.
+ *
+ * Compute the smallest integer value not less than \p x.
+ *
+ * \return
+ * Returns 
+ * \cuda_math_formula \lceil x \rceil \end_cuda_math_formula
+ * expressed as a floating-point number.
+ * - ceil(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - ceil(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - ceil(NaN) returns NaN.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl ceil(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Truncate input argument to the integral part.
+ *
+ * Round \p x to the nearest integer value that does not exceed \p x in 
+ * magnitude.
+ *
+ * \return 
+ * Returns truncated integer value.
+ * - trunc(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - trunc(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - trunc(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 trunc(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl trunc(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Truncate input argument to the integral part.
+ *
+ * Round \p x to the nearest integer value that does not exceed \p x in 
+ * magnitude.
+ *
+ * \return 
+ * Returns truncated integer value.
+ * - truncf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - truncf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - truncf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  truncf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl truncf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * Compute the positive difference between \p x and \p y.  The positive
+ * difference is \p x - \p y when \p x > \p y and +0 otherwise.
+ *
+ * \return 
+ * Returns the positive difference between \p x and \p y.
+ * - fdim(\p x, \p y) returns \p x - \p y if \p x > \p y.
+ * - fdim(\p x, \p y) returns +0 if \p x 
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ * \p y.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fdim(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fdim(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute the positive difference between \p x and \p y.
+ *
+ * Compute the positive difference between \p x and \p y.  The positive
+ * difference is \p x - \p y when \p x > \p y and +0 otherwise.
+ *
+ * \return 
+ * Returns the positive difference between \p x and \p y.
+ * - fdimf(\p x, \p y) returns \p x - \p y if \p x > \p y.
+ * - fdimf(\p x, \p y) returns +0 if \p x 
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ * \p y.
+ * - If either argument is NaN, NaN is returned.
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fdimf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fdimf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc tangent of the ratio of first and second input arguments.
+ *
+ * Calculate the principal value of the arc tangent of the ratio of first
+ * and second input arguments \p y / \p x. The quadrant of the result is
+ * determined by the signs of inputs \p y and \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ].
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , -0) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula.
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , +0) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for \p x < 0.
+ * - atan2(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for \p x > 0.
+ * - atan2(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\pi \end_cuda_math_formula
+ * /2 for \p y < 0.
+ * - atan2(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2 for \p y > 0.
+ * - atan2(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2 for finite \p x.
+ * - atan2(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 3\pi \end_cuda_math_formula
+ * /4.
+ * - atan2(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /4.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl atan2(double y, double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc tangent of the input argument.
+ *
+ * Calculate the principal value of the arc tangent of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2, +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2].
+ * - atan(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atan(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2.
+ * - atan(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl atan(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc cosine of the input argument.
+ *
+ * Calculate the principal value of the arc cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [0, 
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - acos(1) returns +0.
+ * - acos(\p x) returns NaN for \p x outside [-1, +1].
+ * - acos(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl acos(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the arc sine of the input argument.
+ *
+ * Calculate the principal value of the arc sine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2, +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2] for \p x inside [-1, +1].
+ * - asin(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - asin(\p x) returns NaN for \p x outside [-1, +1].
+ * - asin(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl asin(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of two arguments.
+ *
+ * Calculate the length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns the length of the hypotenuse 
+ * \cuda_math_formula \sqrt{x^2+y^2} \end_cuda_math_formula. 
+ * - hypot(\p x,\p y), hypot(\p y,\p x), and hypot(\p x, \p -y) are equivalent.
+ * - hypot(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) is equivalent to fabs(\p x).
+ * - hypot(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ,
+ * even if \p y is a NaN.
+ * - hypot(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_double
+ */
+#if defined(_WIN32)
+#if defined(_MSC_VER) && _MSC_VER < 1900
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double __CRTDECL hypot(double x, double y);
+#else
+extern _ACRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double __cdecl hypot(double x, double y);
+#endif
+#else /* _WIN32 */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double           hypot(double x, double y) __THROW;
+#endif /* _WIN32 */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of two arguments.
+ *
+ * Calculate one over the length of the hypotenuse of a right triangle whose two sides have 
+ * lengths \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the hypotenuse 
+ * \cuda_math_formula \frac{1}{\sqrt{x^2+y^2}} \end_cuda_math_formula. 
+ * - rhypot(\p x,\p y), rhypot(\p y,\p x), and rhypot(\p x, \p -y) are equivalent.
+ * - rhypot(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns +0,
+ * even if \p y is a NaN.
+ * - rhypot(\cuda_math_formula \pm 0, \pm 0 \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - rhypot(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                rhypot(double x, double y) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of two arguments.
+ *
+ * Calculates the length of the hypotenuse of a right triangle whose two sides have lengths 
+ * \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns the length of the hypotenuse 
+ * \cuda_math_formula \sqrt{x^2+y^2} \end_cuda_math_formula. 
+ * - hypotf(\p x,\p y), hypotf(\p y,\p x), and hypotf(\p x, \p -y) are equivalent.
+ * - hypotf(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) is equivalent to fabsf(\p x).
+ * - hypotf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ,
+ * even if \p y is a NaN.
+ * - hypotf(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_single
+ */
+#if defined(_WIN32)
+static __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __CRTDECL hypotf(float x, float y);
+#else /* _WIN32 */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float           hypotf(float x, float y) __THROW;
+#endif /* _WIN32 */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of two arguments.
+ *
+ * Calculates one over the length of the hypotenuse of a right triangle whose two sides have 
+ * lengths \p x and \p y without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the hypotenuse 
+ * \cuda_math_formula \frac{1}{\sqrt{x^2+y^2}} \end_cuda_math_formula. 
+ * - rhypotf(\p x,\p y), rhypotf(\p y,\p x), and rhypotf(\p x, \p -y) are equivalent.
+ * - rhypotf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,\p y) returns +0,
+ * even if \p y is a NaN.
+ * - rhypotf(\cuda_math_formula \pm 0, \pm 0 \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - rhypotf(NaN, \p y) returns NaN, when \p y is not \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                 rhypotf(float x, float y) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of three coordinates of the argument.
+ *
+ * Calculate the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 3D vector
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl norm3d(double a, double b, double c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of three coordinates.
+ *
+ * Calculate one over the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 3D vector 
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                rnorm3d(double a, double b, double c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of four coordinates of the argument.
+ *
+ * Calculate the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 4D vector
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2+d^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl norm4d(double a, double b, double c, double d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate one over the square root of the sum of squares of four coordinates.
+ *
+ * Calculate one over the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 4D vector 
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2+d^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double rnorm4d(double a, double b, double c, double d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the square root of the sum of squares of any number of coordinates.
+ *
+ * Calculate the length of a vector \p p, whose dimension is passed as the argument \p dim, without undue overflow or underflow.
+ *
+ * \return Returns the length of the dim-D vector 
+ * \cuda_math_formula \sqrt{\sum_{i=0}^{dim-1} p_i^2} \end_cuda_math_formula.
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+__device__ __device_builtin__  double norm(int dim, double const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the reciprocal of square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates one over the length of vector \p p, dimension of which is passed as an argument, in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the vector
+ * \cuda_math_formula \frac{1}{\sqrt{\sum_{i=0}^{dim-1} p_i^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double rnorm(int dim, double const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the reciprocal of square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates one over the length of vector \p p, dimension of which is passed as an argument, in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the vector
+ * \cuda_math_formula \frac{1}{\sqrt{\sum_{i=0}^{dim-1} p_i^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+
+extern __device__ __device_builtin__ float rnormf(int dim, float const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of any number of coordinates.
+ *
+ * Calculates the length of a vector \p p, dimension of which is passed as an argument without undue overflow or underflow.
+ *
+ * \return Returns the length of the dim-D vector 
+ * \cuda_math_formula \sqrt{\sum_{i=0}^{dim-1} p_i^2} \end_cuda_math_formula.
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+__device__ __device_builtin__  float normf(int dim, float const * p) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of three coordinates of the argument.
+ *
+ * Calculates the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 3D vector 
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+
+extern __device__ __device_builtin__ float norm3df(float a, float b, float c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of three coordinates.
+ *
+ * Calculates one over the length of three dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 3D vector
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float rnorm3df(float a, float b, float c) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the sum of squares of four coordinates of the argument.
+ *
+ * Calculates the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns the length of the 4D vector
+ * \cuda_math_formula \sqrt{a^2+b^2+c^2+d^2} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns +0, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float norm4df(float a, float b, float c, float d) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate one over the square root of the sum of squares of four coordinates.
+ *
+ * Calculates one over the length of four dimensional vector in Euclidean space without undue overflow or underflow.
+ *
+ * \return Returns one over the length of the 4D vector
+ * \cuda_math_formula \frac{1}{\sqrt{a^2+b^2+c^2+d^2}} \end_cuda_math_formula. 
+ * - In the presence of an exactly infinite coordinate
+ * \cuda_math_formula +0 \end_cuda_math_formula
+ * is returned, even if there are NaNs.
+ * - returns \cuda_math_formula +\infty \end_cuda_math_formula, when all coordinates are \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - returns NaN, when at least one of the coordinates is NaN and none are infinite.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float rnorm4df(float a, float b, float c, float d) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cube root of the input argument.
+ *
+ * Calculate the cube root of \p x, 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ * - cbrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cbrt(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - cbrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cbrt(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cbrt(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cube root of the input argument.
+ *
+ * Calculate the cube root of \p x, 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula x^{1/3} \end_cuda_math_formula.
+ * - cbrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cbrtf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - cbrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cbrtf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl cbrtf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate reciprocal cube root function.
+ *
+ * Calculate reciprocal cube root function of \p x.
+ *
+ * \return 
+ * - rcbrt(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rcbrt(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rcbrt(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rcbrt(double x);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the reciprocal cube root of the input argument.
+ *
+ * Calculate the reciprocal of the cube root of \p x, \cuda_math_formula x^{-1/3} \end_cuda_math_formula.
+ *
+ * \return 
+ * - rcbrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - rcbrtf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - rcbrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rcbrtf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - sinpi(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinpi(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sinpi(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 sinpi(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - sinpif(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinpif(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sinpif(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinpif(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the cosine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the cosine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians), 
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - cospi(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cospi(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cospi(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cospi(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cosine of the input argument 
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the cosine of \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula
+ *  (measured in radians),
+ * where \p x is the input argument.
+ *
+ * \return 
+ * - cospif(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cospif(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cospif(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cospif(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the sine and cosine of the first input argument
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine and cosine of the first input argument \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula (measured in radians). The sine is written to the location pointed to by the
+ * second argument, \p sptr, and the cosine to the location pointed to by the third argument, \p cptr.
+ *
+ * \see ::sinpi() and ::cospi().
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincospi(double x, double *sptr, double *cptr);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine and cosine of the first input argument
+ * \cuda_math_formula \times \pi \end_cuda_math_formula.
+ *
+ * Calculate the sine and cosine of the first input argument \p x
+ * \cuda_math_formula \times \pi \end_cuda_math_formula (measured in radians). The sine is written to the location pointed to by the
+ * second argument, \p sptr, and the cosine to the location pointed to by the third argument, \p cptr.
+ *
+ * \see ::sinpif() and ::cospif().
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ void                   sincospif(float x, float *sptr, float *cptr);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of first argument to the power of second argument.
+ *
+ * Calculate the value of \p x to the power of \p y.
+ *
+ * \return 
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  for \p y an odd integer less than 0.
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y less than 0 and not an odd integer.
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - pow(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - pow(-1, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 1.
+ * - pow(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - pow(\p x, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1 for any \p x, even a NaN.
+ * - pow(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - pow(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - pow(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - pow(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0 and not an odd integer.
+ * - pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0.
+ * - pow(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0.
+ * - pow(\p x, \p y) returns NaN if either \p x or \p y or both are NaN and \p x \cuda_math_formula \neq \end_cuda_math_formula +1 and \p y \cuda_math_formula \neq\pm 0 \end_cuda_math_formula.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl pow(double x, double y) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The
+ * integral part is stored in the object pointed to by \p iptr; the fractional part is the return value.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - modf(
+ * \cuda_math_formula \pm x \end_cuda_math_formula
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - modf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p iptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *   in the object pointed to by \p iptr.
+ * - modf(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl modf(double x, double *iptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the double-precision floating-point remainder of \p x / \p y.
+ *
+ * Calculate the double-precision floating-point remainder of \p x / \p y.
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ *
+ * \return
+ * - Returns the floating-point remainder of \p x / \p y.
+ * - fmod(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  if \p y is not zero.
+ * - fmod(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x if \p x is finite.
+ * - fmod(\p x, \p y) returns NaN if \p x is 
+ * \cuda_math_formula \pm\infty \end_cuda_math_formula
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double         __cdecl fmod(double x, double y) __THROW;
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute double-precision floating-point remainder.
+ *
+ * Compute double-precision floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y. Thus 
+ * \cuda_math_formula  r = x - n y \end_cuda_math_formula.
+ * The value \p n is the integer value nearest 
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula. 
+ * In the case when 
+ * \cuda_math_formula  | n -\frac{x}{y} | = \frac{1}{2}  \end_cuda_math_formula
+ * , the
+ * even \p n value is chosen.
+ *
+ * \return 
+ * - remainder(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns NaN.
+ * - remainder(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y) returns NaN.
+ * - remainder(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x for finite \p x.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 remainder(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl remainder(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute single-precision floating-point remainder.
+ *
+ * Compute single-precision floating-point remainder \p r of dividing 
+ * \p x by \p y for nonzero \p y. Thus 
+ * \cuda_math_formula  r = x - n y \end_cuda_math_formula.
+ * The value \p n is the integer value nearest 
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula. 
+ * In the case when 
+ * \cuda_math_formula  | n -\frac{x}{y} | = \frac{1}{2}  \end_cuda_math_formula
+ * , the
+ * even \p n value is chosen.
+ *
+ * \return 
+ * - remainderf(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns NaN.
+ * - remainderf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y) returns NaN.
+ * - remainderf(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x for finite \p x.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  remainderf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl remainderf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute double-precision floating-point remainder and part of quotient.
+ *
+ * Compute a double-precision floating-point remainder in the same way as the
+ * ::remainder() function. Part of the quotient of \p x divided by \p y is
+ * stored in the location to which \p quo points. The stored value has the same sign as
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula
+ * and may not be the exact quotient but agrees with the exact quotient
+ * in the low order 3 bits.
+ *
+ * \return 
+ * Returns the remainder.
+ * - remquo(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquo(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquo(\p x, \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points if either of \p x or \p y is NaN.
+ * - remquo(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p quo) returns \p x and stores zero
+ * in the location to which \p quo points for finite \p x.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 remquo(double x, double y, int *quo) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl remquo(double x, double y, int *quo);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute single-precision floating-point remainder and part of quotient.
+ *
+ * Compute a single-precision floating-point remainder in the same way as the 
+ * ::remainderf() function. Part of the quotient of \p x divided by \p y is
+ * stored in the location to which \p quo points. The stored value has the same sign as
+ * \cuda_math_formula  \frac{x}{y}  \end_cuda_math_formula
+ * and may not be the exact quotient but agrees with the exact quotient
+ * in the low order 3 bits.
+ *
+ * \return 
+ * Returns the remainder.
+ * - remquof(\p x,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquof(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points.
+ * - remquof(\p x, \p y, \p quo) returns NaN
+ * and stores an unspecified value in the 
+ * location to which \p quo points if either of \p x or \p y is NaN.
+ * - remquof(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p quo) returns \p x and stores zero
+ * in the location to which \p quo points for finite \p x.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  remquof(float x, float y, int *quo) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl remquof(float x, float y, int *quo);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 0.
+ * - j0(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns +0.
+ * - j0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl j0(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 0.
+ * - j0f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns +0.
+ * - j0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  j0f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 1.
+ * - j1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl j1(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula J_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order 1.
+ * - j1f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - j1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  j1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the first kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula J_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order \p n.
+ * - jn(\p n, NaN) returns NaN.
+ * - jn(\p n, \p x) returns NaN for \p n < 0.
+ * - jn(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl jn(int n, double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the first kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the first kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula J_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the first kind of order \p n.
+ * - jnf(\p n, NaN) returns NaN.
+ * - jnf(\p n, \p x) returns NaN for \p n < 0.
+ * - jnf(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  jnf(int n, float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 0.
+ * - y0(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y0(\p x) returns NaN for \p x < 0.
+ * - y0(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl y0(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 0 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 0.
+ * - y0f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y0f(\p x) returns NaN for \p x < 0.
+ * - y0f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  y0f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 1.
+ * - y1(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y1(\p x) returns NaN for \p x < 0.
+ * - y1(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl y1(double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order 1 for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order 1.
+ * - y1f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - y1f(\p x) returns NaN for \p x < 0.
+ * - y1f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - y1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  y1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the Bessel function of the second kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order \p n.
+ * - yn(\p n, \p x) returns NaN for \p n < 0.
+ * - yn(\p n, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - yn(\p n, \p x) returns NaN for \p x < 0.
+ * - yn(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - yn(\p n, NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl yn(int n, double x) __THROW;
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the Bessel function of the second kind of order n for the input argument.
+ *
+ * Calculate the value of the Bessel function of the second kind of order \p n for
+ * the input argument \p x, 
+ * \cuda_math_formula Y_n(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the Bessel function of the second kind of order \p n.
+ * - ynf(\p n, \p x) returns NaN for \p n < 0.
+ * - ynf(\p n, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - ynf(\p n, \p x) returns NaN for \p x < 0.
+ * - ynf(\p n, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - ynf(\p n, NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ynf(int n, float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 0.
+ * - cyl_bessel_i0(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns +1.
+ * - cyl_bessel_i0(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - cyl_bessel_i0(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cyl_bessel_i0(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 0 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 0 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_0(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 0.
+ * - cyl_bessel_i0f(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns +1.
+ * - cyl_bessel_i0f(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - cyl_bessel_i0f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                  cyl_bessel_i0f(float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 1.
+ * - cyl_bessel_i1(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cyl_bessel_i1(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - cyl_bessel_i1(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl cyl_bessel_i1(double x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of the regular modified cylindrical Bessel function of order 1 for the input argument.
+ *
+ * Calculate the value of the regular modified cylindrical Bessel function of order 1 for
+ * the input argument \p x, 
+ * \cuda_math_formula I_1(x) \end_cuda_math_formula.
+ *
+ * \return
+ * Returns the value of the regular modified cylindrical Bessel function of order 1.
+ * - cyl_bessel_i1f(\cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - cyl_bessel_i1f(\cuda_math_formula \pm\infty \end_cuda_math_formula) returns \cuda_math_formula \pm\infty \end_cuda_math_formula.
+ * - cyl_bessel_i1f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __device__ __device_builtin__ float                  cyl_bessel_i1f(float x) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the error function of the input argument.
+ *
+ * Calculate the value of the error function for the input argument \p x,
+ * \cuda_math_formula \frac{2}{\sqrt \pi} \int_0^x e^{-t^2} dt \end_cuda_math_formula.
+ *
+ * \return 
+ * - erf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - erf(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erf(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl erf(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the error function of the input argument.
+ *
+ * Calculate the value of the error function for the input argument \p x,
+ * \cuda_math_formula \frac{2}{\sqrt \pi} \int_0^x e^{-t^2} dt \end_cuda_math_formula.
+ *
+ * \return  
+ * - erff(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erff(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - erff(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erff(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl erff(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse error function of the input argument.
+ *
+ * Calculate the inverse error function
+ * \cuda_math_formula \operatorname{erf}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [-1, 1].
+ *
+ * \return
+ * - erfinv(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erfinv(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfinv(-1) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfinv(\p x) returns NaN for \p x outside [-1, +1].
+ * - erfinv(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse error function of the input argument.
+ *
+ * Calculate the inverse error function
+ * \cuda_math_formula \operatorname{erf}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [-1, 1].
+ *
+ * \return 
+ * - erfinvf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - erfinvf(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfinvf(-1) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfinvf(\p x) returns NaN for \p x outside [-1, +1].
+ * - erfinvf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfinvf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the complementary error function of the input argument.
+ *
+ * Calculate the complementary error function of the input argument \p x,
+ * 1 - erf(\p x).
+ *
+ * \return 
+ * - erfc(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 2.
+ * - erfc(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfc(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfc(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl erfc(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the complementary error function of the input argument.
+ *
+ * Calculate the complementary error function of the input argument \p x,
+ * 1 - erf(\p x).
+ *
+ * \return 
+ * - erfcf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 2.
+ * - erfcf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfcf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl erfcf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the natural logarithm of the absolute value of the gamma function of the input argument.
+ *
+ * Calculate the natural logarithm of the absolute value of the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \log_{e}\left|\Gamma(x)\right| \end_cuda_math_formula.
+ *
+ * \return 
+ * - lgamma(1) returns +0.
+ * - lgamma(2) returns +0.
+ * - lgamma(\p x) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  if \p x 
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ *  0 and \p x is an integer.
+ * - lgamma(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgamma(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgamma(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 lgamma(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl lgamma(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse complementary error function of the input argument.
+ *
+ * Calculate the inverse complementary error function
+ * \cuda_math_formula \operatorname{erfc}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [0, 2].
+ *
+ * \return 
+ * - erfcinv(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcinv(2) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfcinv(\p x) returns NaN for \p x outside [0, 2].
+ * - erfcinv(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfcinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse complementary error function of the input argument.
+ *
+ * Calculate the inverse complementary error function
+ * \cuda_math_formula \operatorname{erfc}^{-1} \end_cuda_math_formula
+ * (\p x), of the input argument \p x in the interval [0, 2].
+ *
+ * \return 
+ * - erfcinvf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcinvf(2) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - erfcinvf(\p x) returns NaN for \p x outside [0, 2].
+ * - erfcinvf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcinvf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the inverse of the standard normal cumulative distribution function.
+ *
+ * Calculate the inverse of the standard normal cumulative distribution function for input argument \p x,
+ * \cuda_math_formula \Phi^{-1}(x) \end_cuda_math_formula. The function is defined for input values in the interval 
+ * \cuda_math_formula (0, 1) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdfinv(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - normcdfinv(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - normcdfinv(\p x) returns NaN
+ *  if \p x is not in the interval [0,1].
+ * - normcdfinv(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 normcdfinv(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the inverse of the standard normal cumulative distribution function.
+ *
+ * Calculate the inverse of the standard normal cumulative distribution function for input argument \p x,
+ * \cuda_math_formula \Phi^{-1}(x) \end_cuda_math_formula. The function is defined for input values in the interval 
+ * \cuda_math_formula (0, 1) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdfinvf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - normcdfinvf(1) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - normcdfinvf(\p x) returns NaN
+ *  if \p x is not in the interval [0,1].
+ * - normcdfinvf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  normcdfinvf(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the standard normal cumulative distribution function.
+ *
+ * Calculate the cumulative distribution function of the standard normal distribution for input argument \p x,
+ * \cuda_math_formula \Phi(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 1.
+ * - normcdf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - normcdf(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 normcdf(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the standard normal cumulative distribution function.
+ *
+ * Calculate the cumulative distribution function of the standard normal distribution for input argument \p x,
+ * \cuda_math_formula \Phi(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - normcdff(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 1.
+ * - normcdff(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - normcdff(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  normcdff(float x);
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the scaled complementary error function of the input argument.
+ *
+ * Calculate the scaled complementary error function of the input argument \p x,
+ * \cuda_math_formula e^{x^2}\cdot \operatorname{erfc}(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - erfcx(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcx(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfcx(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 erfcx(double x);
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the scaled complementary error function of the input argument.
+ *
+ * Calculate the scaled complementary error function of the input argument \p x,
+ * \cuda_math_formula e^{x^2}\cdot \operatorname{erfc}(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - erfcxf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - erfcxf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - erfcxf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  erfcxf(float x);
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the natural logarithm of the absolute value of the gamma function of the input argument.
+ *
+ * Calculate the natural logarithm of the absolute value of the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \log_{e}\left|\Gamma(x)\right| \end_cuda_math_formula.
+ *
+ * \return 
+ * - lgammaf(1) returns +0.
+ * - lgammaf(2) returns +0.
+ * - lgammaf(\p x) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  if \p x
+ * \cuda_math_formula \leq \end_cuda_math_formula
+ *  0 and \p x is an integer.
+ * - lgammaf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgammaf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - lgammaf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  lgammaf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl lgammaf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the gamma function of the input argument.
+ *
+ * Calculate the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \Gamma(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - tgamma(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - tgamma(\p x) returns NaN if \p x < 0 and \p x is an integer.
+ * - tgamma(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tgamma(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - tgamma(NaN) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 tgamma(double x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl tgamma(double x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the gamma function of the input argument.
+ *
+ * Calculate the gamma function of the input argument \p x, namely the value of
+ * \cuda_math_formula \Gamma(x) \end_cuda_math_formula.
+ *
+ * \return 
+ * - tgammaf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - tgammaf(\p x) returns NaN if \p x < 0 and \p x is an integer.
+ * - tgammaf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tgammaf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - tgammaf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tgammaf(float x) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl tgammaf(float x);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/** \ingroup CUDA_MATH_DOUBLE
+ * \brief Create value with given magnitude, copying sign of second value.
+ *
+ * Create a floating-point value with the magnitude \p x and the sign of \p y.
+ *
+ * \return
+ * - a value with the magnitude of \p x and the sign of \p y.
+ * - copysign(\p NaN, \p y) returns a \p NaN with the sign of \p y.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 copysign(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl copysign(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/** \ingroup CUDA_MATH_SINGLE
+ * \brief Create value with given magnitude, copying sign of second value.
+ *
+ * Create a floating-point value with the magnitude \p x and the sign of \p y.
+ *
+ * \return
+ * - a value with the magnitude of \p x and the sign of \p y.
+ * - copysignf(\p NaN, \p y) returns a \p NaN with the sign of \p y.
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  copysignf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl copysignf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Return next representable double-precision floating-point value after argument \p x in the direction of \p y.
+ *
+ * Calculate the next representable double-precision floating-point value
+ * following \p x in the direction of \p y. For example, if \p y is greater than \p x, ::nextafter()
+ * returns the smallest representable number greater than \p x.
+ *
+ * \return 
+ * - nextafter(\p x, \p y) = \p y if \p x equals \p y.
+ * - nextafter(\p x, \p y) = \p NaN if either \p x or \p y is \p NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nextafter(double x, double y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nextafter(double x, double y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Return next representable single-precision floating-point value after argument \p x in the direction of \p y.
+ *
+ * Calculate the next representable single-precision floating-point value
+ * following \p x in the direction of \p y. For example, if \p y is greater than \p x, ::nextafterf()
+ * returns the smallest representable number greater than \p x.
+ *
+ * \return 
+ * - nextafterf(\p x, \p y) = \p y if \p x equals \p y.
+ * - nextafterf(\p x, \p y) = \p NaN if either \p x or \p y is \p NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nextafterf(float x, float y) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nextafterf(float x, float y);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Returns "Not a Number" value.
+ *
+ * Return a representation of a quiet NaN. Argument \p tagp selects one of the possible representations.
+ *
+ * \return 
+ * - nan(\p tagp) returns NaN.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 nan(const char *tagp) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl nan(const char *tagp);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Returns "Not a Number" value.
+ *
+ * Return a representation of a quiet NaN. Argument \p tagp selects one of the possible representations.
+ *
+ * \return 
+ * - nanf(\p tagp) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  nanf(const char *tagp) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl nanf(const char *tagp);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* namespace std */
+#endif
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinff(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnanf(float) __THROW;
+
+
+#if defined(__APPLE__)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinited(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinitef(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitd(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnand(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinfd(double) __THROW;
+#else /* __APPLE__ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finite(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finitef(float) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbit(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnan(double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinf(double) __THROW;
+#endif /* __APPLE__ */
+
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitf(float) __THROW;
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ *
+ * Compute the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation. After computing the value
+ * to infinite precision, the value is rounded once using round-to-nearest,
+ * ties-to-even rounding mode.
+ *
+ * \return
+ * Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - fma(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fma(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fma(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fma(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - fma(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fma(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fma(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_double
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 fma(double x, double y, double z) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP double __cdecl fma(double x, double y, double z);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Compute 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ *
+ * Compute the value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single ternary operation. After computing the value
+ * to infinite precision, the value is rounded once using round-to-nearest,
+ * ties-to-even rounding mode.
+ *
+ * \return
+ * Returns the rounded value of 
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ *  as a single operation.
+ * - fmaf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fmaf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p z) returns NaN.
+ * - fmaf(\p x, \p y, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns NaN if 
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ *  is an exact 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, \cuda_math_formula \pm 0 \end_cuda_math_formula) returns \cuda_math_formula \pm 0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, \cuda_math_formula \mp 0 \end_cuda_math_formula) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y \end_cuda_math_formula is exact \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - fmaf(\p x, \p y, \p z) returns \cuda_math_formula +0 \end_cuda_math_formula if \cuda_math_formula x \times y + z \end_cuda_math_formula is exactly zero and \cuda_math_formula z \neq 0 \end_cuda_math_formula.
+ * - If any argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmaf(float x, float y, float z) __THROW;
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ __CUDA_MATH_CRTIMP float  __cdecl fmaf(float x, float y, float z);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+
+
+/* these are here to avoid warnings on the call graph.
+   long double is not supported on the device */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __signbitl(long double) __THROW;
+#if defined(__APPLE__)
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isfinite(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinf(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnan(long double) __THROW;
+#else /* __APPLE__ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __finitel(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isinfl(long double) __THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int                    __isnanl(long double) __THROW;
+#endif /* __APPLE__ */
+
+#if defined(_WIN32) && ( defined(_M_AMD64) || defined(_M_ARM64) )
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl acosf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl asinf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl atanf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl atan2f(float, float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl cosf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sinf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl tanf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl coshf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sinhf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl tanhf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl expf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl logf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl log10f(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl modff(float, float*) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl powf(float, float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl sqrtf(float) __THROW;         
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl ceilf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl floorf(float) __THROW;
+extern __CUDA_MATH_CRTIMP __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float __cdecl fmodf(float, float) __THROW;
+#else /* _WIN32 && (_M_AMD64 || _M_ARM64) */
+
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+namespace std {
+#endif
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc cosine of the input argument.
+ *
+ * Calculate the principal value of the arc cosine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [0, 
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - acosf(1) returns +0.
+ * - acosf(\p x) returns NaN for \p x outside [-1, +1].
+ * - acosf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  acosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc sine of the input argument.
+ *
+ * Calculate the principal value of the arc sine of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ] for \p x inside [-1, +1].
+ * - asinf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - asinf(\p x) returns NaN for \p x outside [-1, +1].
+ * - asinf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  asinf(float x) __THROW;
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc tangent of the input argument.
+ *
+ * Calculate the principal value of the arc tangent of the input argument \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi/2 \end_cuda_math_formula
+ * ].
+ * - atanf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atanf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2.
+ * - atanf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the arc tangent of the ratio of first and second input arguments.
+ *
+ * Calculate the principal value of the arc tangent of the ratio of first
+ * and second input arguments \p y / \p x. The quadrant of the result is 
+ * determined by the signs of inputs \p y and \p x.
+ *
+ * \return 
+ * Result will be in radians, in the interval [-
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * , +
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * ].
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , -0) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula.
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , +0) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for \p x < 0.
+ * - atan2f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for \p x > 0.
+ * - atan2f(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula -\pi \end_cuda_math_formula
+ * /2 for \p y < 0.
+ * - atan2f(\p y,
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pi \end_cuda_math_formula
+ * /2 for \p y > 0.
+ * - atan2f(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2f(
+ * \cuda_math_formula \pm y \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * for finite \p y > 0.
+ * - atan2f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p x) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /2 for finite \p x.
+ * - atan2f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 3\pi \end_cuda_math_formula
+ * /4.
+ * - atan2f(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ,
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \pi \end_cuda_math_formula
+ * /4.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  atan2f(float y, float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the cosine of the input argument.
+ *
+ * Calculate the cosine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - cosf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - cosf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - cosf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cosf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the sine of the input argument.
+ *
+ * Calculate the sine of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - sinf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - sinf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the tangent of the input argument.
+ *
+ * Calculate the tangent of the input argument \p x (measured in radians).
+ *
+ * \return 
+ * - tanf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tanf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns NaN.
+ * - tanf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tanf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic cosine of the input argument.
+ *
+ * Calculate the hyperbolic cosine of the input argument \p x.
+ *
+ * \return 
+ * - coshf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - coshf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - coshf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  coshf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic sine of the input argument.
+ *
+ * Calculate the hyperbolic sine of the input argument \p x.
+ *
+ * \return 
+ * - sinhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sinhf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - sinhf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the hyperbolic tangent of the input argument.
+ *
+ * Calculate the hyperbolic tangent of the input argument \p x.
+ *
+ * \return 
+ * - tanhf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - tanhf( 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula \pm 1 \end_cuda_math_formula.
+ * - tanhf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  tanhf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the natural logarithm of the input argument.
+ *
+ * Calculate the natural logarithm of the input argument \p x.
+ *
+ * \return 
+ * - logf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - logf(1) returns +0.
+ * - logf(\p x) returns NaN for \p x < 0.
+ * - logf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - logf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  logf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument.
+ *
+ * Calculate
+ * \cuda_math_formula e^x \end_cuda_math_formula
+ * ,
+ * the base 
+ * \cuda_math_formula e \end_cuda_math_formula
+ *  exponential of the input argument \p x.
+ *
+ * \return
+ * - expf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1.
+ * - expf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0.
+ * - expf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - expf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  expf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the base 10 logarithm of the input argument.
+ *
+ * Calculate the base 10 logarithm of the input argument \p x.
+ *
+ * \return 
+ * - log10f(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula.
+ * - log10f(1) returns +0.
+ * - log10f(\p x) returns NaN for \p x < 0.
+ * - log10f(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - log10f(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  log10f(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Break down the input argument into fractional and integral parts.
+ *
+ * Break down the argument \p x into fractional and integral parts. The integral part is stored in the argument \p iptr.
+ * Fractional and integral parts are given the same sign as the argument \p x.
+ *
+ * \return 
+ * - modff(
+ * \cuda_math_formula \pm x \end_cuda_math_formula
+ * , \p iptr) returns a result with the same sign as \p x.
+ * - modff(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * , \p iptr) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  and stores 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *   in the object pointed to by \p iptr.
+ * - modff(NaN, \p iptr) stores a NaN in the object pointed to by \p iptr and returns a NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  modff(float x, float *iptr) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the value of first argument to the power of second argument.
+ *
+ * Calculate the value of \p x to the power of \p y.
+ *
+ * \return 
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ *  for \p y an odd integer less than 0.
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y less than 0 and not an odd integer.
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - powf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns +0 for \p y > 0 and not an odd integer.
+ * - powf(-1, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 1.
+ * - powf(+1, \p y) returns 1 for any \p y, even a NaN.
+ * - powf(\p x, 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 1 for any \p x, even a NaN.
+ * - powf(\p x, \p y) returns a NaN for finite \p x < 0 and finite non-integer \p y.
+ * - powf(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - powf(\p x, 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - powf(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns +0 for 
+ * \cuda_math_formula | x | < 1 \end_cuda_math_formula.
+ * - powf(\p x, 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for 
+ * \cuda_math_formula | x | > 1 \end_cuda_math_formula.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns -0 for \p y an odd integer less than 0.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0 and not an odd integer.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ *  for \p y an odd integer greater than 0.
+ * - powf(
+ * \cuda_math_formula -\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0 and not an odd integer.
+ * - powf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns +0 for \p y < 0.
+ * - powf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ *  for \p y > 0.
+ * - powf(\p x, \p y) returns NaN if either \p x or \p y or both are NaN and \p x \cuda_math_formula \neq \end_cuda_math_formula +1 and \p y \cuda_math_formula \neq\pm 0 \end_cuda_math_formula.
+ *
+ * \note_accuracy_single
+ * \note_fastmath
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  powf(float x, float y) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the square root of the input argument.
+ *
+ * Calculate the nonnegative square root of \p x, 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ *
+ * \return 
+ * Returns 
+ * \cuda_math_formula \sqrt{x} \end_cuda_math_formula.
+ * - sqrtf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - sqrtf(
+ * \cuda_math_formula +\infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula +\infty \end_cuda_math_formula.
+ * - sqrtf(\p x) returns NaN if \p x is less than 0.
+ * - sqrtf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sqrtf(float x) __THROW;         
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate ceiling of the input argument.
+ *
+ * Compute the smallest integer value not less than \p x.
+ *
+ * \return
+ * Returns 
+ * \cuda_math_formula \lceil x \rceil \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - ceilf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - ceilf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - ceilf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  ceilf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the largest integer less than or equal to \p x.
+ * 
+ * Calculate the largest integer value which is less than or equal to \p x.
+ * 
+ * \return
+ * Returns 
+ * \cuda_math_formula \lfloor x \rfloor \end_cuda_math_formula
+ *  expressed as a floating-point number.
+ * - floorf(
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula.
+ * - floorf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * ) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula.
+ * - floorf(NaN) returns NaN.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  floorf(float x) __THROW;
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the floating-point remainder of \p x / \p y.
+ *
+ * Calculate the floating-point remainder of \p x / \p y.
+ * The floating-point remainder of the division operation \p x / \p y calculated
+ * by this function is exactly the value <tt>x - n*y</tt>, where \p n is \p x / \p y with its fractional part truncated.
+ * The computed value will have the same sign as \p x, and its magnitude will be less than the magnitude of \p y.
+ * \return
+ * - Returns the floating-point remainder of \p x / \p y.
+ * - fmodf(
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ * , \p y) returns 
+ * \cuda_math_formula \pm 0 \end_cuda_math_formula
+ *  if \p y is not zero.
+ * - fmodf(\p x, 
+ * \cuda_math_formula \pm \infty \end_cuda_math_formula
+ * ) returns \p x if \p x is finite.
+ * - fmodf(\p x, \p y) returns NaN if \p x is 
+ * \cuda_math_formula \pm\infty \end_cuda_math_formula
+ *  or \p y is zero.
+ * - If either argument is NaN, NaN is returned.
+ *
+ * \note_accuracy_single
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  fmodf(float x, float y) __THROW;
+#if defined(__QNX__)
+/* redeclare some builtins that QNX uses */
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FLog(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FCosh(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FSinh(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float _FSinx(float, unsigned int, int);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int _FDsign(float);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ int _Dsign(double);
+#endif
+#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
+} /* std */
+#endif
+#endif /* _WIN32 && (_M_AMD64 || _M_ARM64) */
+
+}
+
+#if !defined(__CUDACC_RTC__)
+#include <math.h>
+#include <stdlib.h>
+
+#ifndef __CUDA_INTERNAL_SKIP_CPP_HEADERS__
+#include <cmath>
+#include <cstdlib>
+#endif /* __CUDA_INTERNAL_SKIP_CPP_HEADERS__ */
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(long double x);
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(long double x);
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+
+#elif defined(__GNUC__)
+
+#undef signbit
+#undef isfinite
+#undef isnan
+#undef isinf
+
+#if defined(__APPLE__)
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(double x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(float x); 
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(double x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+#else /* !(!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000) */
+template <typename T>
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool __libcpp_isnan(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(float x) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY  __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(long double x) _NOEXCEPT;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000 */
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+#else /* !(!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000) */
+template <typename T>
+__cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ bool __libcpp_isinf(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(float x) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(long double x) _NOEXCEPT;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 7000 */
+
+#else /* __APPLE__ */
+
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+#if !defined(_NVHPC_CUDA)
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool signbit(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isfinite(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(float x);
+/* GCC 6.1 uses ::isnan(double x) for isnan(double x) if the condition is true */
+#if _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#else /* !(_GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC) */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(double x);
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isnan(long double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(float x);
+/* GCC 6.1 uses ::isinf(double x) for isinf(double x) if the condition is true. */
+#if _GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#else /* !(_GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC) */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(double x);
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISINF && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ constexpr bool isinf(long double x);
+}
+#endif
+
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+
+#if defined(__QNX__)
+#if (__QNX__) && !defined(_LIBCPP_VERSION)
+/* QNX defines functions in std, need to declare them here */
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(float x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(double x);
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(long double x);
+}
+#else
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const float x);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const double x);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool signbit(const long double x);
+#endif
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(const long double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(const long double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const float a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const double a);
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(const long double a);
+#else /* ! __QNX__ */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const float x);
+#if defined(__ICC)
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const double x) throw();
+#else /* !__ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const double x);
+#endif /* __ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int signbit(const long double x);
+
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const float x);
+#if defined(__ICC)
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const double x) throw();
+#else /* !__ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const double x);
+#endif /* __ICC */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isfinite(const long double x);
+
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+template <typename T>
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool __libcpp_isnan(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(float x) _NOEXCEPT;
+#else /* !((defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(float x);
+#endif /* (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+#if defined(__ANDROID__) || defined(__HORIZON__)
+#if !defined(_LIBCPP_VERSION)
+__forceinline__
+#endif  /* !defined(_LIBCPP_VERSION) */
+#if _LIBCPP_VERSION >= 7000
+#ifdef _LIBCPP_PREFERRED_OVERLOAD
+_LIBCPP_INLINE_VISIBILITY _LIBCPP_PREFERRED_OVERLOAD __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(double x) _NOEXCEPT;
+#endif /* _LIBCPP_PREFERRED_OVERLOAD */
+#else /* _LIBCPP_VERSION < 7000 */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x);
+#endif /* _LIBCPP_VERSION >= 7000 */
+#else /* !(__ANDROID__ || __HORIZON__) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(double x) throw();
+#endif /* __ANDROID__ || __HORIZON__ */
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+inline _LIBCPP_INLINE_VISIBILITY  __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isnan(long double x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isnan(long double x);
+#endif /* (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+
+#if (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+static __inline__ __cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ unsigned __FLOAT_BITS(float __f);
+static __inline__ __cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ unsigned long long __DOUBLE_BITS(double __f);
+template <typename T>
+__cudart_builtin__ __DEVICE_FUNCTIONS_DECL__ bool __libcpp_isinf(T) _NOEXCEPT;
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(float x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(float x);
+#endif /* (defined(__ANDROID__) || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+
+#if defined(__ANDROID__) || defined(__HORIZON__)
+#if !defined(_LIBCPP_VERSION)
+__forceinline__
+#endif  /* !defined(_LIBCPP_VERSION) */
+#if _LIBCPP_VERSION >= 7000
+#ifdef _LIBCPP_PREFERRED_OVERLOAD
+_LIBCPP_INLINE_VISIBILITY _LIBCPP_PREFERRED_OVERLOAD __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(double x) _NOEXCEPT;
+#endif /* _LIBCPP_PREFERRED_OVERLOAD */
+#else /* _LIBCPP_VERSION < 7000 */
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x);
+#endif /* _LIBCPP_VERSION >= 7000 */
+#else /* ! (__ANDROID__  || __HORIZON__) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(double x) throw();
+#endif /* __ANDROID__ || __HORIZON__ */
+#if (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000
+inline _LIBCPP_INLINE_VISIBILITY __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool isinf(long double x) _NOEXCEPT;
+#else /* !( (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000) */
+__forceinline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ int isinf(long double x);
+#endif  /* (defined(__ANDROID__)  || defined(__HORIZON__)) && _LIBCPP_VERSION >= 8000 */
+#endif /* __QNX__  */
+
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* __APPLE__ */
+
+#if !defined(_LIBCPP_VERSION)
+#if defined(__clang__)
+#if __has_include(<ext/random>)
+#define __NV_GLIBCXX_VERSION 40800
+#endif /* __has_include(<ext/random>) */
+#endif /* __clang__ */
+
+#if !defined(__NV_GLIBCXX_VERSION)
+#define __NV_GLIBCXX_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 
+#endif /* !__NV_GLIBCXX_VERSION */
+#endif /* !defined(_LIBCPP_VERSION) */
+
+#if !defined(__HORIZON__) || !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800
+#if defined(__arm__) && !defined(_STLPORT_VERSION) && !_GLIBCXX_USE_C99
+#if !defined(__ANDROID__) || (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800)
+
+#if defined(__QNX__)
+/* QNX defines functions in std, need to declare them here */
+namespace std {
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs (long long int a);
+}
+#elif defined(__HORIZON__)
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#pragma GCC system_header
+#endif
+_LIBCPP_BEGIN_NAMESPACE_STD
+__DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs (long long int a) throw();
+_LIBCPP_END_NAMESPACE_STD
+#else
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a);
+#endif /* __QNX__ || __HORIZON__*/
+
+#endif /* !__ANDROID__ || (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800) */
+#endif /* __arm__ && !_STLPORT_VERSION && !_GLIBCXX_USE_C99 */
+#endif /* !defined(__HORIZON__) || !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800 */
+
+#if defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800 && !defined(__ibmxl__)
+
+#if !defined(_STLPORT_VERSION)
+namespace __gnu_cxx
+{
+#endif /* !_STLPORT_VERSION */
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a);
+
+#if !defined(_STLPORT_VERSION)
+}
+#endif /* !_STLPORT_VERSION */
+
+#endif /* defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION < 40800 && !__ibmxl__ */
+
+namespace std
+{
+  template<typename T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T __pow_helper(T, int);
+  template<typename T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T __cmath_power(T, unsigned int);
+}
+
+using std::abs;
+using std::fabs;
+using std::ceil;
+using std::floor;
+using std::sqrt;
+#if !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800
+using std::pow;
+#endif /* !defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 3800 */
+using std::log;
+using std::log10;
+using std::fmod;
+using std::modf;
+using std::exp;
+using std::frexp;
+using std::ldexp;
+using std::asin;
+using std::sin;
+using std::sinh;
+using std::acos;
+using std::cos;
+using std::cosh;
+using std::atan;
+using std::atan2;
+using std::tan;
+using std::tanh;
+
+#elif defined(_WIN32)
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __CUDA_MATH_CRTIMP double __cdecl _hypot(double x, double y);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __CUDA_MATH_CRTIMP float  __cdecl _hypotf(float x, float y);
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int signbit(long double a);
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else
+#define __SIGNBIT_THROW
+#endif
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ bool signbit(long double) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _ldsign(long double);
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is negative. 
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE signbit(double a);
+#undef __RETURN_TYPE 
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+#define __RETURN_TYPE bool
+#if _MSC_VER >= 1900
+#define __SIGNBIT_THROW throw()
+#else
+#define __SIGNBIT_THROW
+#endif
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * 
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the floating-point value \p a is negative.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 host compiler: __RETURN_TYPE is 'bool'. Returns 
+ * true if and only if \p a is negative.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a 
+ * nonzero value if and only if \p a is negative. 
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __RETURN_TYPE signbit(double) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _dsign(double);
+#undef __RETURN_TYPE 
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ *
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the sign bit of the floating-point value \p a is set.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if the sign bit of \p a is set.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value
+ * if and only if the sign bit of \p a is set.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE signbit(float a);
+#undef __RETURN_TYPE
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+#define __RETURN_TYPE bool
+#if _MSC_VER >= 1900 /* only these MSVC versions get the throw() specification */
+#define __SIGNBIT_THROW throw()
+#else /* _MSC_VER < 1900: no exception specification */
+#define __SIGNBIT_THROW
+#endif /* _MSC_VER >= 1900 */
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ *
+ * \brief Return the sign bit of the input.
+ *
+ * Determine whether the sign bit of the floating-point value \p a is set.
+ *
+ * \return
+ * Reports the sign bit of all values including infinities, zeros, and NaNs.
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if the sign bit of \p a is set.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero value
+ * if and only if the sign bit of \p a is set.
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __RETURN_TYPE signbit(float) __SIGNBIT_THROW;
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ __device_builtin__ __CUDA_MATH_CRTIMP int _fdsign(float); /* NOTE(review): presumably the MSVC CRT helper behind signbit - confirm */
+#undef __RETURN_TYPE
+#undef __SIGNBIT_THROW
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800) /* isinf(long double): int result in this branch */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isinf(long double a);
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800: bool result */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isinf(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ *
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(double a);
+#undef __RETURN_TYPE
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ *
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ *
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ *
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(float a);
+#undef __RETURN_TYPE
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ *
+ * \brief Determine whether argument is infinite.
+ *
+ * Determine whether the floating-point value \p a is an infinite value
+ * (positive or negative).
+ *
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is an infinite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a nonzero
+ * value if and only if \p a is an infinite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isinf(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800) /* isnan(long double): int result in this branch */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isnan(long double a);
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800: bool result */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isnan(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ *
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(double a);
+#undef __RETURN_TYPE
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ *
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ *
+ *
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(float a);
+#undef __RETURN_TYPE
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ *
+ *
+ * \brief Determine whether argument is a NaN.
+ *
+ * Determine whether the floating-point value \p a is a NaN.
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a NaN value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns a
+ * nonzero value if and only if \p a is a NaN value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isnan(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800) /* isfinite(long double): int result in this branch */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ int isfinite(long double a);
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800: bool result */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ bool isfinite(long double a);
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ *
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(double a);
+#undef __RETURN_TYPE
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ *
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(double a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#define __RETURN_TYPE int
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(float a);
+#undef __RETURN_TYPE
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+#define __RETURN_TYPE bool
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Determine whether argument is finite.
+ *
+ * Determine whether the floating-point value \p a is a finite value
+ * (zero, subnormal, or normal and not infinity or NaN).
+ *
+ * \return
+ * - With Visual Studio 2013 or later host compilers: __RETURN_TYPE is 'bool'.
+ * Returns true if and only if \p a is a finite value.
+ * - With other host compilers: __RETURN_TYPE is 'int'. Returns
+ * a nonzero value if and only if \p a is a finite value.
+ */
+static __inline__ __DEVICE_FUNCTIONS_DECL__ __RETURN_TYPE isfinite(float a);
+#undef __RETURN_TYPE
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+template<class T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T _Pow_int(T, int);
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - abs(\p LLONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int a);
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800: same declarations, but with throw() */
+template<class T> extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ T _Pow_int(T, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int) throw();
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* __CUDACC_RTC__ */
+
+#if __cplusplus >= 201103L
+#define __NV_NOEXCEPT noexcept
+#else /* __cplusplus < 201103L: fall back to the dynamic exception specification */
+#define __NV_NOEXCEPT throw()
+#endif /* __cplusplus >= 201103L */
+
+#if defined(_LIBCPP_VERSION) && defined(_LIBCPP_BEGIN_NAMESPACE_STD) && !defined(_STLPORT_VERSION)
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+#endif /* __clang__ */
+#if _LIBCPP_VERSION < 3800 /* newer libc++ versions get no namespace wrapper here */
+_LIBCPP_BEGIN_NAMESPACE_STD
+#endif /* _LIBCPP_VERSION < 3800 */
+#elif defined(__GNUC__) && !defined(_STLPORT_VERSION)
+namespace std { /* closed by the matching #elif branch after the overload declarations */
+#endif /* defined(_LIBCPP_VERSION) && defined(_LIBCPP_BEGIN_NAMESPACE_STD) && !defined(_STLPORT_VERSION) ||
+          defined(__GNUC__) && !defined(_STLPORT_VERSION) */
+#if defined(__CUDACC_RTC__) || defined(__GNUC__)
+
+#if defined(__CUDACC_RTC__) || \
+    (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION >= 40800) || \
+    defined(__ibmxl__)
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int abs(long long int);
+#endif /* __CUDACC_RTC__ ||
+          (defined(__NV_GLIBCXX_VERSION) && __NV_GLIBCXX_VERSION >= 40800) ||
+          __ibmxl__ */
+
+#endif /* __CUDACC_RTC__ || __GNUC__ */
+
+#if defined(__CUDACC_RTC__) || \
+    (!defined(_MSC_VER) || _MSC_VER < 1800) && \
+    (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101))
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the absolute value of the input \p long \p int argument.
+ *
+ * Calculate the absolute value of the input argument \p a.
+ *
+ * \return
+ * Returns the absolute value of the input argument.
+ * - abs(\p LONG_MIN) is \p Undefined
+ */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long int __cdecl abs(long int a);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl abs(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl abs(double);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fabs(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ceil(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl floor(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sqrt(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, float);
+
+#if !defined(__QNX__)
+     
+#if defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION)
+template<typename _Tp, typename _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+typename __gnu_cxx::__promote_2<_Tp, _Up>::__type pow(_Tp, _Up);
+#else  /* !(defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION)) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl pow(double, int);
+#endif  /* defined(__GNUC__) && __cplusplus >= 201103L && !defined(_LIBCPP_VERSION) */
+     
+#endif  /* !defined(__QNX__) */
+
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log10(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fmod(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl modf(float, float*);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl exp(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl frexp(float, int*);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ldexp(float, int);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl asin(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sin(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sinh(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl acos(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cos(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cosh(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan2(float, float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tan(float);
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tanh(float);
+#else /* !(__CUDACC_RTC__ ||
+           (!defined(_MSC_VER) || _MSC_VER < 1800) &&
+           (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101))): same overloads, declared throw() */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long int __cdecl abs(long int) throw();
+#if defined(_LIBCPP_VERSION)
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ long long int __cdecl abs(long long int) throw();
+#endif /* defined(_LIBCPP_VERSION) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl abs(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl abs(double) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fabs(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ceil(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl floor(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sqrt(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, float) throw();
+#if defined(_LIBCPP_VERSION)
+#if (defined (__ANDROID__) || defined(__HORIZON__)) && (_LIBCPP_VERSION >= 9000)
+template <class _A1, class _A2>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+#if _LIBCPP_VERSION >= 14000
+typename std::__enable_if_t
+#else /* _LIBCPP_VERSION < 14000 */
+typename std::_EnableIf
+#endif /*  _LIBCPP_VERSION >= 14000 */
+<
+    std::is_arithmetic<_A1>::value &&
+    std::is_arithmetic<_A2>::value,
+    std::__promote<_A1, _A2>
+>::type pow(_A1 __lcpp_x, _A2 __lcpp_y) __NV_NOEXCEPT;
+#elif (defined(__APPLE__) && __clang_major__ >= 7) || _LIBCPP_VERSION >= 3800 || defined(__QNX__)
+template <class _Tp, class _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+#if defined(__QNX__) && (_LIBCPP_VERSION >= 160000)
+typename std::__enable_if_t <
+#elif _LIBCPP_VERSION >= 13000
+typename std::enable_if <
+#else /* _LIBCPP_VERSION < 13000 */
+typename std::__lazy_enable_if <
+#endif /* defined(__QNX__) && (_LIBCPP_VERSION >= 160000) */
+  std::is_arithmetic<_Tp>::value && std::is_arithmetic<_Up>::value,
+  std::__promote<_Tp, _Up>
+>::type pow(_Tp __x, _Up __y) __NV_NOEXCEPT;
+#else /* !((__APPLE__ && __clang_major__ >= 7) || _LIBCPP_VERSION >= 3800 || __QNX__) */
+template <class _Tp, class _Up>
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__
+typename enable_if <
+  std::is_arithmetic<_Tp>::value && std::is_arithmetic<_Up>::value,
+  typename std::__promote<_Tp, _Up>::type
+>::type pow(_Tp __x, _Up __y) __NV_NOEXCEPT;
+#endif /* (__ANDROID__ || __HORIZON__) && (_LIBCPP_VERSION >= 9000) */
+#else /* !defined(_LIBCPP_VERSION) */
+#if !(defined(__GNUC__) && __cplusplus >= 201103L)
+#if (defined(_MSC_VER) && (_MSC_VER >= 1928)) && !(defined __CUDA_INTERNAL_SKIP_CPP_HEADERS__)
+template <class _Ty1, class _Ty2, ::std:: enable_if_t< ::std:: is_arithmetic_v<_Ty1> && ::std:: is_arithmetic_v<_Ty2>, int> > [[nodiscard]] __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ ::std:: _Common_float_type_t<_Ty1, _Ty2> __cdecl pow(_Ty1 _Left, _Ty2 _Right) noexcept;
+#else /* !((defined(_MSC_VER) && (_MSC_VER >= 1928)) && !(defined __CUDA_INTERNAL_SKIP_CPP_HEADERS__)) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl pow(float, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ double   __cdecl pow(double, int) throw();
+#endif /* (defined(_MSC_VER) && (_MSC_VER >= 1928)) && !(defined __CUDA_INTERNAL_SKIP_CPP_HEADERS__) */
+#endif /* !(defined(__GNUC__) && __cplusplus >= 201103L) */
+#endif /* defined(_LIBCPP_VERSION) */
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl log10(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl fmod(float, float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl modf(float, float*) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl exp(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl frexp(float, int*) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl ldexp(float, int) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl asin(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sin(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl sinh(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl acos(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cos(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl cosh(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl atan2(float, float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tan(float) throw();
+extern __DEVICE_FUNCTIONS_DECL__ __cudart_builtin__ float    __cdecl tanh(float) throw();
+#endif /* __CUDACC_RTC__ ||
+          (!defined(_MSC_VER) || _MSC_VER < 1800) &&
+          (!defined(_LIBCPP_VERSION) || (_LIBCPP_VERSION < 1101)) */
+
+#if defined(_LIBCPP_VERSION) && defined(_LIBCPP_END_NAMESPACE_STD) && !defined(_STLPORT_VERSION)
+#if _LIBCPP_VERSION < 3800 /* mirrors the _LIBCPP_BEGIN_NAMESPACE_STD emitted before the overloads */
+_LIBCPP_END_NAMESPACE_STD
+#endif /* _LIBCPP_VERSION < 3800 */
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif /* __clang__ */
+#elif defined(__GNUC__) && !defined(_STLPORT_VERSION)
+} /* namespace std */
+#endif /* defined(_LIBCPP_VERSION) && defined(_LIBCPP_END_NAMESPACE_STD) && !defined(_STLPORT_VERSION) ||
+          defined(__GNUC__) && !defined(_STLPORT_VERSION) */
+
+#undef __DEVICE_FUNCTIONS_DECL__
+#undef __NV_NOEXCEPT
+/* Decorators for later math declarations: plain qualifiers under __CUDACC_RTC__, else also static inline + __cudart_builtin__. */
+#if defined(__CUDACC_RTC__)
+#define __MATH_FUNCTIONS_DECL__ __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ __device__
+#else /* !defined(__CUDACC_RTC__) */
+#define __MATH_FUNCTIONS_DECL__ static inline __host__ __device__ __cudart_builtin__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ static inline __device__ __cudart_builtin__
+#endif /* __CUDACC_RTC__ */
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+#if defined(__QNX__) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)
+#if defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)
+#if defined(_LIBCPP_VERSION)
+#define __NV_NOEXCEPT _NOEXCEPT
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else /* !defined(_LIBCPP_VERSION) */
+#define __NV_NOEXCEPT
+namespace std {
+__host__ __device__ __cudart_builtin__ int ilogbf(float a);
+#endif /* defined(_LIBCPP_VERSION) */
+#else /* !(defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)) */
+#define __NV_NOEXCEPT _NOEXCEPT
+#endif /* defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000) */
+__host__ __device__ __cudart_builtin__ float logb(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ int ilogb(float a) __NV_NOEXCEPT;
+
+__host__ __device__ __cudart_builtin__ float scalbn(float a, int b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float scalbln(float a, long int b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float exp2(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float expm1(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float log2(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float log1p(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float acosh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float asinh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float atanh(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float hypot(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float cbrt(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float erf(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float erfc(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float lgamma(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float tgamma(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float copysign(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float nextafter(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float remainder(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float remquo(float a, float b, int *quo) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float round(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long int lround(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long long int llround(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float trunc(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float rint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long int lrint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ long long int llrint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float nearbyint(float a) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fdim(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fma(float a, float b, float c) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fmax(float a, float b) __NV_NOEXCEPT;
+__host__ __device__ __cudart_builtin__ float fmin(float a, float b) __NV_NOEXCEPT;
+#if defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000)
+#if defined(_LIBCPP_VERSION)
+_LIBCPP_END_NAMESPACE_STD
+using _VSTD::logb;
+using _VSTD::ilogb;
+using _VSTD::scalbn;
+using _VSTD::scalbln;
+using _VSTD::exp2;
+using _VSTD::expm1;
+using _VSTD::log2;
+using _VSTD::log1p;
+using _VSTD::acosh;
+using _VSTD::asinh;
+using _VSTD::atanh;
+using _VSTD::hypot;
+using _VSTD::cbrt;
+using _VSTD::erf;
+using _VSTD::erfc;
+using _VSTD::lgamma;
+using _VSTD::tgamma;
+using _VSTD::copysign;
+using _VSTD::nextafter;
+using _VSTD::remainder;
+using _VSTD::remquo;
+using _VSTD::round;
+using _VSTD::lround;
+using _VSTD::llround;
+using _VSTD::trunc;
+using _VSTD::rint;
+using _VSTD::lrint;
+using _VSTD::llrint;
+using _VSTD::nearbyint;
+using _VSTD::fdim;
+using _VSTD::fma;
+using _VSTD::fmax;
+using _VSTD::fmin;
+#else /* !defined(_LIBCPP_VERSION): close the plain namespace std opened above */
+}
+#endif /* defined(_LIBCPP_VERSION) */
+#endif /* defined(__QNX__) && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 8000) */
+#undef __NV_NOEXCEPT
+#else /* !(defined(__QNX__) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)) */
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+namespace std {
+__host__ __device__ __cudart_builtin__ constexpr float logb(float a);
+__host__ __device__ __cudart_builtin__ constexpr int ilogb(float a);
+__host__ __device__ __cudart_builtin__ constexpr float scalbn(float a, int b);
+__host__ __device__ __cudart_builtin__ constexpr float scalbln(float a, long int b);
+__host__ __device__ __cudart_builtin__ constexpr float exp2(float a);
+__host__ __device__ __cudart_builtin__ constexpr float expm1(float a);
+__host__ __device__ __cudart_builtin__ constexpr float log2(float a);
+__host__ __device__ __cudart_builtin__ constexpr float log1p(float a);
+__host__ __device__ __cudart_builtin__ constexpr float acosh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float asinh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float atanh(float a);
+__host__ __device__ __cudart_builtin__ constexpr float hypot(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float cbrt(float a);
+__host__ __device__ __cudart_builtin__ constexpr float erf(float a);
+__host__ __device__ __cudart_builtin__ constexpr float erfc(float a);
+__host__ __device__ __cudart_builtin__ constexpr float lgamma(float a);
+__host__ __device__ __cudart_builtin__ constexpr float tgamma(float a);
+__host__ __device__ __cudart_builtin__ constexpr float copysign(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float nextafter(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float remainder(float a, float b);
+__host__ __device__ __cudart_builtin__ float remquo(float a, float b, int *quo); /* the only declaration in this list without constexpr */
+__host__ __device__ __cudart_builtin__ constexpr float round(float a);
+__host__ __device__ __cudart_builtin__ constexpr long int lround(float a);
+__host__ __device__ __cudart_builtin__ constexpr long long int llround(float a);
+__host__ __device__ __cudart_builtin__ constexpr float trunc(float a);
+__host__ __device__ __cudart_builtin__ constexpr float rint(float a);
+__host__ __device__ __cudart_builtin__ constexpr long int lrint(float a);
+__host__ __device__ __cudart_builtin__ constexpr long long int llrint(float a);
+__host__ __device__ __cudart_builtin__ constexpr float nearbyint(float a);
+__host__ __device__ __cudart_builtin__ constexpr float fdim(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float fma(float a, float b, float c);
+__host__ __device__ __cudart_builtin__ constexpr float fmax(float a, float b);
+__host__ __device__ __cudart_builtin__ constexpr float fmin(float a, float b);
+}
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+__MATH_FUNCTIONS_DECL__ float logb(float a);
+
+__MATH_FUNCTIONS_DECL__ int ilogb(float a);
+
+__MATH_FUNCTIONS_DECL__ float scalbn(float a, int b);
+
+__MATH_FUNCTIONS_DECL__ float scalbln(float a, long int b);
+
+__MATH_FUNCTIONS_DECL__ float exp2(float a);
+
+__MATH_FUNCTIONS_DECL__ float expm1(float a);
+
+__MATH_FUNCTIONS_DECL__ float log2(float a);
+
+__MATH_FUNCTIONS_DECL__ float log1p(float a);
+
+__MATH_FUNCTIONS_DECL__ float acosh(float a);
+
+__MATH_FUNCTIONS_DECL__ float asinh(float a);
+
+__MATH_FUNCTIONS_DECL__ float atanh(float a);
+
+__MATH_FUNCTIONS_DECL__ float hypot(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float cbrt(float a);
+
+__MATH_FUNCTIONS_DECL__ float erf(float a);
+
+__MATH_FUNCTIONS_DECL__ float erfc(float a);
+
+__MATH_FUNCTIONS_DECL__ float lgamma(float a);
+
+__MATH_FUNCTIONS_DECL__ float tgamma(float a);
+
+__MATH_FUNCTIONS_DECL__ float copysign(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float nextafter(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float remainder(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float remquo(float a, float b, int *quo);
+
+__MATH_FUNCTIONS_DECL__ float round(float a);
+
+__MATH_FUNCTIONS_DECL__ long int lround(float a);
+
+__MATH_FUNCTIONS_DECL__ long long int llround(float a);
+
+__MATH_FUNCTIONS_DECL__ float trunc(float a);
+
+__MATH_FUNCTIONS_DECL__ float rint(float a);
+
+__MATH_FUNCTIONS_DECL__ long int lrint(float a);
+
+__MATH_FUNCTIONS_DECL__ long long int llrint(float a);
+
+__MATH_FUNCTIONS_DECL__ float nearbyint(float a);
+
+__MATH_FUNCTIONS_DECL__ float fdim(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float fma(float a, float b, float c);
+
+__MATH_FUNCTIONS_DECL__ float fmax(float a, float b);
+
+__MATH_FUNCTIONS_DECL__ float fmin(float a, float b);
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* defined(__QNX__) || (defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800) */
+#else /* defined(_MSC_VER) && _MSC_VER >= 1800: extern __cdecl declarations with throw() */
+extern __host__ __device__ __cudart_builtin__ float __cdecl logb(float) throw();
+extern __host__ __device__ __cudart_builtin__ int   __cdecl ilogb(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl scalbn(float, float) throw(); /* NOTE(review): 2nd parameter is float here but int in the other branches - confirm intended */
+extern __host__ __device__ __cudart_builtin__ float __cdecl scalbln(float, long int) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl exp2(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl expm1(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl log2(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl log1p(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl acosh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl asinh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl atanh(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl hypot(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl cbrt(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl erf(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl erfc(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl lgamma(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl tgamma(float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl copysign(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl nextafter(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl remainder(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl remquo(float, float, int *) throw();
+extern __host__ __device__ __cudart_builtin__ float __cdecl round(float) throw();
+extern __host__ __device__ __cudart_builtin__ long int      __cdecl lround(float) throw();
+extern __host__ __device__ __cudart_builtin__ long long int __cdecl llround(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl trunc(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl rint(float) throw();
+extern __host__ __device__ __cudart_builtin__ long int      __cdecl lrint(float) throw();
+extern __host__ __device__ __cudart_builtin__ long long int __cdecl llrint(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl nearbyint(float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fdim(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fma(float, float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fmax(float, float) throw();
+extern __host__ __device__ __cudart_builtin__ float         __cdecl fmin(float, float) throw();
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+__MATH_FUNCTIONS_DECL__ float exp10(const float a);
+
+__MATH_FUNCTIONS_DECL__ float rsqrt(const float a);
+
+__MATH_FUNCTIONS_DECL__ float rcbrt(const float a);
+
+__MATH_FUNCTIONS_DECL__ float sinpi(const float a);
+
+__MATH_FUNCTIONS_DECL__ float cospi(const float a);
+
+__MATH_FUNCTIONS_DECL__ void sincospi(const float a, float *const sptr, float *const cptr);
+
+__MATH_FUNCTIONS_DECL__ void sincos(const float a, float *const sptr, float *const cptr);
+
+__MATH_FUNCTIONS_DECL__ float j0(const float a);
+
+__MATH_FUNCTIONS_DECL__ float j1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float jn(const int n, const float a);
+
+__MATH_FUNCTIONS_DECL__ float y0(const float a);
+
+__MATH_FUNCTIONS_DECL__ float y1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float yn(const int n, const float a);
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i0(const float a);
+
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i1(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfcinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float normcdfinv(const float a);
+
+__MATH_FUNCTIONS_DECL__ float normcdf(const float a);
+
+__MATH_FUNCTIONS_DECL__ float erfcx(const float a);
+
+__MATH_FUNCTIONS_DECL__ double copysign(const double a, const float b);
+
+__MATH_FUNCTIONS_DECL__ double copysign(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p int and \p unsigned \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p int and \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long int min(const long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p int and \p unsigned \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p int and \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long long int min(const long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p long \p long \p int and \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the minimum value of the input \p unsigned \p long \p long \p int and \p long \p long \p int arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the minimum value of the input \p float arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fminf() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ float min(const float a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p double arguments.
+ *
+ * Calculate the minimum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmin() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p float and \p double arguments.
+ *
+ * Convert \p float argument \p a to \p double, followed by ::fmin().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the minimum value of the input \p double and \p float arguments.
+ *
+ * Convert \p float argument \p b to \p double, followed by ::fmin().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p int and \p unsigned \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const int a, const unsigned int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p int and \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long int max(const long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p int and \p unsigned \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const long int a, const unsigned long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p int and \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ long long int max(const long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p long \p long \p int and \p unsigned \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const long long int a, const unsigned long long int b);
+
+/**
+ * \ingroup CUDA_MATH_INT
+ * \brief Calculate the maximum value of the input \p unsigned \p long \p long \p int and \p long \p long \p int arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b, perform integer promotion first.
+ */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const long long int b);
+
+/**
+ * \ingroup CUDA_MATH_SINGLE
+ * \brief Calculate the maximum value of the input \p float arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmaxf() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ float max(const float a, const float b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p double arguments.
+ *
+ * Calculate the maximum value of the arguments \p a and \p b.
+ * Behavior is equivalent to ::fmax() function.
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p float and \p double arguments.
+ *
+ * Convert \p float argument \p a to \p double, followed by ::fmax().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const float a, const double b);
+
+/**
+ * \ingroup CUDA_MATH_DOUBLE
+ * \brief Calculate the maximum value of the input \p double and \p float arguments.
+ *
+ * Convert \p float argument \p b to \p double, followed by ::fmax().
+ *
+ * Note, this is different from \p std:: specification
+ */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const float b);
+
+#undef __MATH_FUNCTIONS_DECL__
+#undef __MATH_FUNCTIONS_DEVICE_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#undef EXCLUDE_FROM_RTC
+
+extern "C"{
+inline __device__ void *__nv_aligned_device_malloc(size_t size, size_t align)
+{
+  __device__ void *__nv_aligned_device_malloc_impl(size_t, size_t); /* block-scope declaration: the impl name is only visible inside this wrapper */
+  return __nv_aligned_device_malloc_impl(size, align); /* forward size and align unchanged and return the impl's pointer */
+}
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+* ONLY FOR HOST CODE! NOT FOR DEVICE EXECUTION                                 *
+*                                                                              *
+*******************************************************************************/
+
+#include <crt/func_macro.h>
+
+#if defined(_WIN32)
+#pragma warning (push)
+#pragma warning (disable : 4211)
+
+#endif /* _WIN32 */
+
+__func__(double rsqrt(double a));
+
+__func__(double rcbrt(double a));
+
+__func__(double sinpi(double a));
+
+__func__(double cospi(double a));
+
+__func__(void sincospi(double a, double *sptr, double *cptr));
+
+__func__(double erfinv(double a));
+
+__func__(double erfcinv(double a));
+
+__func__(double normcdfinv(double a));
+
+__func__(double normcdf(double a));
+
+__func__(double erfcx(double a));
+
+__func__(float rsqrtf(float a));
+
+__func__(float rcbrtf(float a));
+
+__func__(float sinpif(float a));
+
+__func__(float cospif(float a));
+
+__func__(void sincospif(float a, float *sptr, float *cptr));
+
+__func__(float erfinvf(float a));
+
+__func__(float erfcinvf(float a));
+
+__func__(float normcdfinvf(float a));
+
+__func__(float normcdff(float a));
+
+__func__(float erfcxf(float a));
+
+__func__(int min(int a, int b));
+
+__func__(unsigned int umin(unsigned int a, unsigned int b));
+
+__func__(long long int llmin(long long int a, long long int b));
+
+__func__(unsigned long long int ullmin(unsigned long long int a, unsigned long long int b));
+
+__func__(int max(int a, int b));
+
+__func__(unsigned int umax(unsigned int a, unsigned int b));
+
+__func__(long long int llmax(long long int a, long long int b));
+
+__func__(unsigned long long int ullmax(unsigned long long int a, unsigned long long int b));
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__ANDROID__)
+
+__func__(int __isnan(double a));
+
+#endif /* _WIN32 || __APPLE__ || __ANDROID__ */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__QNX__)
+
+__func__(void sincos(double a, double *sptr, double *cptr));
+
+#endif /* _WIN32 || __APPLE__ || __QNX__ */
+
+#if defined(_WIN32) || defined(__APPLE__)
+
+__func__(double exp10(double a));
+
+__func__(float exp10f(float a));
+
+__func__(void sincosf(float a, float *sptr, float *cptr));
+
+__func__(int __isinf(double a));
+
+#endif /* _WIN32 || __APPLE__ */
+
+#if (defined(_WIN32) && (!defined(_MSC_VER) || _MSC_VER < 1800)) || defined (__ANDROID__)
+
+__func__(double log2(double a));
+
+#endif /* (_WIN32 && (!defined(_MSC_VER) || _MSC_VER < 1800)) || __ANDROID__ */
+
+#if defined(_WIN32)
+
+__func__(int __signbit(double a));
+
+__func__(int __finite(double a));
+
+__func__(int __signbitl(long double a));
+
+__func__(int __signbitf(float a));
+
+__func__(int __finitel(long double a));
+
+__func__(int __finitef(float a));
+
+__func__(int __isinfl(long double a));
+
+__func__(int __isinff(float a));
+
+__func__(int __isnanl(long double a));
+
+__func__(int __isnanf(float a));
+
+#endif /* _WIN32 */
+
+#if defined(_WIN32) && (!defined(_MSC_VER) || _MSC_VER < 1800)
+
+__func__(double copysign(double a, double b));
+
+__func__(double fmax(double a, double b));
+
+__func__(double fmin(double a, double b));
+
+__func__(double trunc(double a));
+
+__func__(double round(double a));
+
+__func__(long int lround(double a));
+
+__func__(long long int llround(double a));
+
+__func__(double rint(double a));
+
+__func__(double nearbyint(double a));
+
+__func__(long int lrint(double a));
+
+__func__(long long int llrint(double a));
+
+__func__(double fdim(double a, double b));
+
+__func__(double scalbn(double a, int b));
+
+__func__(double scalbln(double a, long int b));
+
+__func__(double exp2(double a));
+
+__func__(double log1p(double a));
+
+__func__(double expm1(double a));
+
+__func__(double cbrt(double a));
+
+__func__(double acosh(double a));
+
+__func__(double asinh(double a));
+
+__func__(double atanh(double a));
+
+__func__(int ilogb(double a));
+
+__func__(double logb(double a));
+
+__func__(double remquo(double a, double b, int *quo));
+
+__func__(double remainder(double a, double b));
+
+__func__(double fma (double a, double b, double c));
+
+__func__(double nextafter(double a, double b));
+
+__func__(double erf(double a));
+
+__func__(double erfc(double a));
+
+__func__(double lgamma(double a));
+
+__func__(unsigned long long int __internal_host_nan_kernel(const char *s));
+
+__func__(double nan(const char *tagp));
+
+__func__(double __host_tgamma_kernel(double a));
+
+__func__(double __host_stirling_poly(double a));
+
+__func__(double __host_tgamma_stirling(double a));
+
+__func__(double tgamma(double a));
+
+__func__(float fmaxf(float a, float b));
+
+__func__(float fminf(float a, float b));
+
+__func__(float roundf(float a));
+
+__func__(long int lroundf(float a));
+
+__func__(long long int llroundf(float a));
+
+__func__(float truncf(float a));
+
+__func__(float rintf(float a));
+
+__func__(float nearbyintf(float a));
+
+__func__(long int lrintf(float a));
+
+__func__(long long int llrintf(float a));
+
+__func__(float logbf(float a));
+
+__func__(float scalblnf(float a, long int b));
+
+__func__(float log2f(float a));
+
+__func__(float exp2f(float a));
+
+__func__(float acoshf(float a));
+
+__func__(float asinhf(float a));
+
+__func__(float atanhf(float a));
+
+__func__(float cbrtf(float a));
+
+__func__(float expm1f(float a));
+
+__func__(float fdimf(float a, float b));
+
+__func__(float log1pf(float a));
+
+__func__(float scalbnf(float a, int b));
+
+__func__(float fmaf(float a, float b, float c));
+
+__func__(int ilogbf(float a));
+
+__func__(float erff(float a));
+
+__func__(float erfcf(float a));
+
+__func__(float lgammaf(float a));
+
+__func__(float tgammaf(float a));
+
+__func__(float remquof(float a, float b, int *quo));
+
+__func__(float remainderf(float a, float b));
+
+__func__(float copysignf(float a, float b));
+
+__func__(float nextafterf(float a, float b));
+
+__func__(float nanf(const char *tagp));
+
+#endif /* _WIN32 && (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* _WIN32 */
+
+#endif /* !__CUDACC__ */
+
+#undef EXCLUDE_FROM_RTC
+
+#if !defined(__CUDACC_RTC__)
+
+#include "math_functions.hpp"
+
+#endif /* !__CUDACC_RTC__ */
+
+#endif /* !__MATH_FUNCTIONS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc09b915ea07f8ef376f5c3640f963a09e86dbfd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/math_functions.hpp
@@ -0,0 +1,3398 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/math_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/math_functions.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__
+#endif
+
+#if !defined(__MATH_FUNCTIONS_HPP__)
+#define __MATH_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if defined(__CUDACC_RTC__)
+
+__host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+__host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbit(x); }
+__host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(static_cast<double>(x));} /* long double argument is cast to double before the call */
+
+__host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __finitef(x); }
+__host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __finite(x); }
+__host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __finitel(static_cast<double>(x)); } /* long double argument is cast to double before the call */
+
+__host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+__host__ __device__ __cudart_builtin__ int isnan(const double x) { return __isnan(x); }
+__host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnanl(static_cast<double>(x)); } /* long double argument is cast to double before the call */
+
+__host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+__host__ __device__ __cudart_builtin__ int isinf(const double x) { return __isinf(x); }
+__host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinfl(static_cast<double>(x)); } /* long double argument is cast to double before the call */
+
+__host__ __device__ __cudart_builtin__ long long int abs(const long long int a) { return llabs(a); }
+
+__host__ __device__ __cudart_builtin__ long int  abs(const long int in)        { return llabs(in); } /* the long int overload also goes through llabs(); the result is returned as long int */
+__host__ __device__ __cudart_builtin__ float     abs(const float in)           { return fabsf(in); }
+__host__ __device__ __cudart_builtin__ double    abs(const double in)          { return fabs(in); }
+__host__ __device__ __cudart_builtin__ float     fabs(const float in)          { return fabsf(in); }
+__host__ __device__ __cudart_builtin__ float     ceil(const float in)          { return ceilf(in); }
+__host__ __device__ __cudart_builtin__ float     floor(const float in)         { return floorf(in); }
+__host__ __device__ __cudart_builtin__ float     sqrt(const float in)          { return sqrtf(in); }
+__host__ __device__ __cudart_builtin__ float     pow(const float a, const float b)   { return powf(a, b); }
+extern "C" __device__ float powif(float, int); 
+__host__ __device__ __cudart_builtin__ float     pow(const float a, const int b)     { return powif(a, b); }
+extern "C" __device__ double powi(double, int);
+__host__ __device__ __cudart_builtin__ double    pow(const double a, const int b)    { return powi(a, b); }
+__host__ __device__ __cudart_builtin__ float     log(const float in)           { return logf(in); }
+__host__ __device__ __cudart_builtin__ float     log10(const float in)         { return log10f(in); }
+__host__ __device__ __cudart_builtin__ float     fmod(const float a, const float b)  { return fmodf(a, b); }
+__host__ __device__ __cudart_builtin__ float     modf(const float a, float*b)  { return modff(a, b); }
+__host__ __device__ __cudart_builtin__ float     exp(const float in)           { return expf(in); }
+__host__ __device__ __cudart_builtin__ float     frexp(const float a, int*b)   { return frexpf(a, b); }
+__host__ __device__ __cudart_builtin__ float     ldexp(const float a, int b)   { return ldexpf(a, b); }
+__host__ __device__ __cudart_builtin__ float     asin(const float in)          { return asinf(in); }
+__host__ __device__ __cudart_builtin__ float     sin(const float in)           { return sinf(in); }
+__host__ __device__ __cudart_builtin__ float     sinh(const float in)          { return sinhf(in); }
+__host__ __device__ __cudart_builtin__ float     acos(const float in)          { return acosf(in); }
+__host__ __device__ __cudart_builtin__ float     cos(const float in)           { return cosf(in); }
+__host__ __device__ __cudart_builtin__ float     cosh(const float in)          { return coshf(in); }
+__host__ __device__ __cudart_builtin__ float     atan(const float in)          { return atanf(in); }
+__host__ __device__ __cudart_builtin__ float     atan2(const float a, const float b) { return atan2f(a, b); }
+__host__ __device__ __cudart_builtin__ float     tan(const float in)           { return tanf(in); }
+__host__ __device__ __cudart_builtin__ float     tanh(const float in)          { return tanhf(in); }
+
+#elif defined(__GNUC__)
+
+#undef signbit
+#undef isfinite
+#undef isnan
+#undef isinf
+
+#if defined(_LIBCPP_VERSION)
+extern "C" __device__ float powif(float, int);
+extern "C" __device__ double powi(double, int);
+#endif /* _LIBCPP_VERSION */
+
+#if defined(__APPLE__)
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x) { return __signbitf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x) { return __signbitd(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x) { return __signbitl(x);}
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x) { return __isfinitef(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x) { return __isfinited(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x) { return __isfinite(x); }
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x) throw()  { return __isnand(x); }
+#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x) { return __isnanf(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x) { return __isnan(x); }
+#endif /* defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 */
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x) throw()  { return __isinfd(x); }
+#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x) { return __isinff(x); }
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x) { return __isinf(x); }
+#endif /* defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 7000 */
+#else /* __APPLE__ */
+
+#if ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)
+#if defined(__CUDA_ARCH__)
+#define __NV_BUILTIN_FUNC_DECL__ __forceinline__ __host__ __device__ __cudart_builtin__
+#if _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC
+__NV_BUILTIN_FUNC_DECL__ int  isnan(const double a) throw() { return __isnan(a); }
+__NV_BUILTIN_FUNC_DECL__ int  isinf(const double x) throw() { return __isinf(x); }
+#endif /* _GLIBCXX_HAVE_OBSOLETE_ISNAN && !_GLIBCXX_NO_OBSOLETE_ISINF_ISNAN_DYNAMIC */
+#undef __NV_BUILTIN_FUNC_DECL__
+#endif /* defined(__CUDA_ARCH__) */
+#else /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+
+#if defined(__QNX__)
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+/* signbit overloads returning bool. When __CUDA_ARCH__ is defined each one
+ * compares the matching __signbit* call against zero; otherwise it forwards
+ * to the signbit<T> template. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const float x)
+{
+#if !defined(__CUDA_ARCH__)
+  return signbit<float>(x);
+#else /* __CUDA_ARCH__ */
+  const bool nonzero = (__signbitf(x) != 0);
+  return nonzero;
+#endif /* !__CUDA_ARCH__ */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const double x)
+{
+#if !defined(__CUDA_ARCH__)
+  return signbit<double>(x);
+#else /* __CUDA_ARCH__ */
+  const bool nonzero = (__signbit(x) != 0);
+  return nonzero;
+#endif /* !__CUDA_ARCH__ */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool signbit(const long double x)
+{
+#if !defined(__CUDA_ARCH__)
+  return signbit<long double>(x);
+#else /* __CUDA_ARCH__ */
+  const bool nonzero = (__signbitl(x) != 0);
+  return nonzero;
+#endif /* !__CUDA_ARCH__ */
+}
+#endif /* (__QNX__ && _LIBCPP_VERSION) */
+
+/* isfinite overloads returning bool. When __CUDA_ARCH__ is defined each one
+ * compares the matching __finite* call against zero; otherwise it forwards
+ * to the isfinite<T> template. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const long double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isfinite<long double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__finitel(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isfinite<double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__finite(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const float a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isfinite<float>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__finitef(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+/* isnan overloads returning bool. When __CUDA_ARCH__ is defined each one
+ * compares the matching __isnan* call against zero; otherwise it forwards
+ * to the isnan<T> template. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const long double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isnan<long double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isnanl(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isnan<double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isnan(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const float a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isnan<float>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isnanf(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+/* isinf overloads returning bool. When __CUDA_ARCH__ is defined each one
+ * compares the matching __isinf* call against zero; otherwise it forwards
+ * to the isinf<T> template. */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const long double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isinf<long double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isinfl(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isinf<double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isinf(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const float a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isinf<float>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isinff(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+#elif ( (defined(__ANDROID__) || defined(__HORIZON__)) && defined(_LIBCPP_VERSION))
+#if defined(__CUDA_ARCH__)
+/* signbit overloads used when __CUDA_ARCH__ is defined: forward to
+ * __signbitf / __signbit / __signbitl. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x)
+{
+  return __signbitf(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x)
+{
+  return __signbit(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x)
+{
+  return __signbitl(x);
+}
+
+/* isfinite overloads used when __CUDA_ARCH__ is defined: forward to
+ * __finitef / __finite / __finitel. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x)
+{
+  return __finitef(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x)
+{
+  return __finite(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x)
+{
+  return __finitel(x);
+}
+
+/* isnan overloads used when __CUDA_ARCH__ is defined. The double form
+ * (forwarding to __isnan) is always present; the float and long double forms
+ * are only added for _LIBCPP_VERSION below 8000. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x)
+{
+  return __isnan(x);
+}
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x)
+{
+  return __isnanf(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x)
+{
+  return __isnanl(x);
+}
+#endif  /* _LIBCPP_VERSION < 8000 */
+
+/* isinf overloads used when __CUDA_ARCH__ is defined. The double form
+ * (forwarding to __isinf) is always present; the float and long double forms
+ * are only added for _LIBCPP_VERSION below 8000. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x)
+{
+  return __isinf(x);
+}
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x)
+{
+  return __isinff(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x)
+{
+  return __isinfl(x);
+}
+#endif /* _LIBCPP_VERSION < 8000 */
+#else /* !defined(__CUDA_ARCH__) */
+/* signbit overloads used when __CUDA_ARCH__ is not defined: each forwards to
+ * the signbit<T> template for its own argument type. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x)
+{
+  return signbit<float>(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x)
+{
+  return signbit<double>(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x)
+{
+  return signbit<long double>(x);
+}
+
+/* isfinite overloads used when __CUDA_ARCH__ is not defined: each forwards to
+ * the isfinite<T> template for its own argument type. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x)
+{
+  return isfinite<float>(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x)
+{
+  return isfinite<double>(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x)
+{
+  return isfinite<long double>(x);
+}
+
+/* float and long double isnan/isinf for _LIBCPP_VERSION below 8000 when
+ * __CUDA_ARCH__ is not defined; each forwards to the isnan<T> / isinf<T>
+ * template. The double forms are not defined here. */
+#if _LIBCPP_VERSION < 8000
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x)
+{
+  return isnan<float>(x);
+}
+/* int isnan(double) provided by math.h */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x)
+{
+  return isnan<long double>(x);
+}
+
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x)
+{
+  return isinf<float>(x);
+}
+/* int isinf(double) provided by math.h */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x)
+{
+  return isinf<long double>(x);
+}
+#endif /* _LIBCPP_VERSION < 8000 */
+
+#endif  /* defined(__CUDA_ARCH__) */
+
+#else /* !__QNX__ && !((__ANDROID__ || __HORIZON__) && _LIBCPP_VERSION) */
+/* signbit overloads forwarding to __signbitf / __signbit / __signbitl. The
+ * double form carries an empty throw() specification only when __ICC is
+ * defined. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const float x)
+{
+  return __signbitf(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const double x)
+#if defined(__ICC)
+  throw()
+#endif /* __ICC */
+{
+  return __signbit(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int signbit(const long double x)
+{
+  return __signbitl(x);
+}
+
+/* isfinite for float. Forwards to __isfinitef only on __ANDROID__ when
+ * __CUDA_ARCH__ is not defined; every other configuration uses __finitef. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const float x)
+{
+#if defined(__ANDROID__) && !defined(__CUDA_ARCH__)
+  return __isfinitef(x);
+#else /* !__ANDROID__ || __CUDA_ARCH__ */
+  return __finitef(x);
+#endif /* __ANDROID__ && !__CUDA_ARCH__ */
+}
+
+/* isfinite for double. Forwards to __isfinite only on __ANDROID__ when
+ * __CUDA_ARCH__ is not defined; every other configuration uses __finite.
+ * The empty throw() specification is present only for __ICC builds that are
+ * not __ANDROID__. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const double x)
+#if !defined(__ANDROID__) && defined(__ICC)
+  throw()
+#endif /* !__ANDROID__ && __ICC */
+{
+#if defined(__ANDROID__) && !defined(__CUDA_ARCH__)
+  return __isfinite(x);
+#else /* !__ANDROID__ || __CUDA_ARCH__ */
+  return __finite(x);
+#endif /* __ANDROID__ && !__CUDA_ARCH__ */
+}
+
+/* isfinite for long double. Forwards to __isfinitel only on __ANDROID__ when
+ * __CUDA_ARCH__ is not defined; every other configuration uses __finitel. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double x)
+{
+#if defined(__ANDROID__) && !defined(__CUDA_ARCH__)
+  return __isfinitel(x);
+#else /* !__ANDROID__ || __CUDA_ARCH__ */
+  return __finitel(x);
+#endif /* __ANDROID__ && !__CUDA_ARCH__ */
+}
+
+/* isnan overloads forwarding to __isnanf / __isnan / __isnanl. The double
+ * form carries an empty throw() specification except on __ANDROID__. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const float x)
+{
+  return __isnanf(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const double x)
+#if !defined(__ANDROID__)
+  throw()
+#endif /* !__ANDROID__ */
+{
+  return __isnan(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isnan(const long double x)
+{
+  return __isnanl(x);
+}
+
+/* isinf overloads forwarding to __isinff / __isinf / __isinfl. The double
+ * form carries an empty throw() specification except on __ANDROID__. */
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const float x)
+{
+  return __isinff(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const double x)
+#if !defined(__ANDROID__)
+  throw()
+#endif /* !__ANDROID__ */
+{
+  return __isinf(x);
+}
+__forceinline__ __host__ __device__ __cudart_builtin__ int isinf(const long double x)
+{
+  return __isinfl(x);
+}
+#endif /* __QNX__ */
+
+#endif /* ((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L) */
+#endif /* __APPLE__ */
+
+#if defined(__arm__) && !defined(_STLPORT_VERSION) && !_GLIBCXX_USE_C99
+#if !defined(__ANDROID__) || (!defined(_LIBCPP_VERSION) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)))
+
+#if !defined(__QNX__) && !defined(__HORIZON__)
+/* long long int overload of abs: delegates to llabs. */
+static __inline__ __host__ __device__ __cudart_builtin__ long long int abs(const long long int a)
+{
+  const long long int result = llabs(a);
+  return result;
+}
+#endif /* !__QNX__ && !__HORIZON__ */
+
+#endif /* !defined(__ANDROID__) || (!defined(_LIBCPP_VERSION) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))) */
+#endif /* __arm__ && !_STLPORT_VERSION && !_GLIBCXX_USE_C99 */
+
+#elif defined(_WIN32)
+
+/* int-returning signbit overloads, defined only without _MSC_VER or with
+ * _MSC_VER below 1800. Each forwards to __signbitl / __signbit / __signbitf. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const long double a)
+{
+  const int result = __signbitl(a);
+  return result;
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const double a)
+{
+  const int result = __signbit(a);
+  return result;
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int signbit(const float a)
+{
+  const int result = __signbitf(a);
+  return result;
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* isinf for long double and double.
+ * - With _MSC_VER defined and at least 1800: bool-returning overloads that
+ *   compare __isinfl / __isinf against zero when __CUDA_ARCH__ is defined and
+ *   forward to the isinf<T> template when it is not.
+ * - Otherwise: int-returning wrappers over __isinfl / __isinf. */
+#if defined(_MSC_VER) && _MSC_VER >= 1800
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const long double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isinf<long double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isinfl(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isinf<double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isinf(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const long double a)
+{
+  return __isinfl(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const double a)
+{
+  return __isinf(a);
+}
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+
+/* isinf for float.
+ * - Without _MSC_VER, or with _MSC_VER below 1800: int-returning wrapper over
+ *   __isinff.
+ * - Otherwise: bool-returning overload that compares __isinff against zero
+ *   when __CUDA_ARCH__ is defined and forwards to isinf<float> when it is not.
+ * Both forms carry __cudart_builtin__, matching the long double and double
+ * isinf overloads and the isnan / isfinite float overloads in this region. */
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+static __inline__ __host__ __device__ __cudart_builtin__ int isinf(const float a)
+{
+  return __isinff(a);
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ bool isinf(const float a)
+{
+#if defined(__CUDA_ARCH__)
+  return (__isinff(a) != 0);
+#else /* defined(__CUDA_ARCH__) */
+  return isinf<float>(a);
+#endif /* defined(__CUDA_ARCH__) */
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* isnan for long double, double and float.
+ * - With _MSC_VER defined and at least 1800: bool-returning overloads that
+ *   compare __isnanl / __isnan / __isnanf against zero when __CUDA_ARCH__ is
+ *   defined and forward to the isnan<T> template when it is not.
+ * - Otherwise: int-returning wrappers over the same three calls. */
+#if defined(_MSC_VER) && _MSC_VER >= 1800
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const long double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isnan<long double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isnanl(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isnan<double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isnan(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ bool isnan(const float a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isnan<float>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__isnanf(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const long double a)
+{
+  return __isnanl(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const double a)
+{
+  return __isnan(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int isnan(const float a)
+{
+  return __isnanf(a);
+}
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+
+/* isfinite for long double, double and float.
+ * - With _MSC_VER defined and at least 1800: bool-returning overloads that
+ *   compare __finitel / __finite / __finitef against zero when __CUDA_ARCH__
+ *   is defined and forward to the isfinite<T> template when it is not.
+ * - Otherwise: int-returning wrappers over the same three calls. */
+#if defined(_MSC_VER) && _MSC_VER >= 1800
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const long double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isfinite<long double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__finitel(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const double a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isfinite<double>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__finite(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ bool isfinite(const float a)
+{
+#if !defined(__CUDA_ARCH__)
+  return isfinite<float>(a);
+#else /* defined(__CUDA_ARCH__) */
+  const bool nonzero = (__finitef(a) != 0);
+  return nonzero;
+#endif /* !defined(__CUDA_ARCH__) */
+}
+#else /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const long double a)
+{
+  return __finitel(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const double a)
+{
+  return __finite(a);
+}
+
+static __inline__ __host__ __device__ __cudart_builtin__ int isfinite(const float a)
+{
+  return __finitef(a);
+}
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1800 */
+
+#endif /* __CUDACC_RTC__ */
+
+/* Declaration prefixes for the overloads that follow: `static inline` is
+ * prepended unless __CUDACC_RTC__ is defined, in which case only the
+ * __host__ __device__ / __device__ qualifiers are used. */
+#if !defined(__CUDACC_RTC__)
+#define __MATH_FUNCTIONS_DECL__ static inline __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ static inline __device__
+#else /* __CUDACC_RTC__ */
+#define __MATH_FUNCTIONS_DECL__ __host__ __device__
+#define __MATH_FUNCTIONS_DEVICE_DECL__ __device__
+#endif /* !__CUDACC_RTC__ */
+
+#if defined(__CUDACC_RTC__) || (!defined(_MSC_VER) || _MSC_VER < 1800)
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+_LIBCPP_BEGIN_NAMESPACE_STD
+#endif /* __QNX__ && _LIBCPP_VERSION */
+#if !defined(__QNX__) && !(defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800)
+#if !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L))
+/* float overload of logb: delegates to logbf. */
+__MATH_FUNCTIONS_DECL__ float logb(const float a)
+{
+  const float result = logbf(a);
+  return result;
+}
+
+/* float overload of ilogb: delegates to ilogbf. */
+__MATH_FUNCTIONS_DECL__ int ilogb(const float a)
+{
+  const int result = ilogbf(a);
+  return result;
+}
+
+/* float overload of scalbn: delegates to scalbnf with the same int argument. */
+__MATH_FUNCTIONS_DECL__ float scalbn(const float a, const int b)
+{
+  const float result = scalbnf(a, b);
+  return result;
+}
+
+/* float overload of scalbln: delegates to scalblnf with the same long int argument. */
+__MATH_FUNCTIONS_DECL__ float scalbln(const float a, const long int b)
+{
+  const float result = scalblnf(a, b);
+  return result;
+}
+
+/* float overload of exp2: delegates to exp2f. */
+__MATH_FUNCTIONS_DECL__ float exp2(const float a)
+{
+  const float result = exp2f(a);
+  return result;
+}
+
+/* float overload of expm1: delegates to expm1f. */
+__MATH_FUNCTIONS_DECL__ float expm1(const float a)
+{
+  const float result = expm1f(a);
+  return result;
+}
+
+/* float overload of log2: delegates to log2f. */
+__MATH_FUNCTIONS_DECL__ float log2(const float a)
+{
+  const float result = log2f(a);
+  return result;
+}
+
+/* float overload of log1p: delegates to log1pf. */
+__MATH_FUNCTIONS_DECL__ float log1p(const float a)
+{
+  const float result = log1pf(a);
+  return result;
+}
+
+/* float overload of acosh: delegates to acoshf. */
+__MATH_FUNCTIONS_DECL__ float acosh(const float a)
+{
+  const float result = acoshf(a);
+  return result;
+}
+
+/* float overload of asinh: delegates to asinhf. */
+__MATH_FUNCTIONS_DECL__ float asinh(const float a)
+{
+  const float result = asinhf(a);
+  return result;
+}
+
+/* float overload of atanh: delegates to atanhf. */
+__MATH_FUNCTIONS_DECL__ float atanh(const float a)
+{
+  const float result = atanhf(a);
+  return result;
+}
+
+/* float overload of hypot: delegates to hypotf. */
+__MATH_FUNCTIONS_DECL__ float hypot(const float a, const float b)
+{
+  const float result = hypotf(a, b);
+  return result;
+}
+
+/* float overload of cbrt: delegates to cbrtf. */
+__MATH_FUNCTIONS_DECL__ float cbrt(const float a)
+{
+  const float result = cbrtf(a);
+  return result;
+}
+
+/* float overload of erf: delegates to erff. */
+__MATH_FUNCTIONS_DECL__ float erf(const float a)
+{
+  const float result = erff(a);
+  return result;
+}
+
+/* float overload of erfc: delegates to erfcf. */
+__MATH_FUNCTIONS_DECL__ float erfc(const float a)
+{
+  const float result = erfcf(a);
+  return result;
+}
+
+/* float overload of lgamma: delegates to lgammaf. */
+__MATH_FUNCTIONS_DECL__ float lgamma(const float a)
+{
+  const float result = lgammaf(a);
+  return result;
+}
+
+/* float overload of tgamma: delegates to tgammaf. */
+__MATH_FUNCTIONS_DECL__ float tgamma(const float a)
+{
+  const float result = tgammaf(a);
+  return result;
+}
+
+/* float overload of copysign: delegates to copysignf. */
+__MATH_FUNCTIONS_DECL__ float copysign(const float a, const float b)
+{
+  const float result = copysignf(a, b);
+  return result;
+}
+
+/* float overload of nextafter: delegates to nextafterf. */
+__MATH_FUNCTIONS_DECL__ float nextafter(const float a, const float b)
+{
+  const float result = nextafterf(a, b);
+  return result;
+}
+
+/* float overload of remainder: delegates to remainderf. */
+__MATH_FUNCTIONS_DECL__ float remainder(const float a, const float b)
+{
+  const float result = remainderf(a, b);
+  return result;
+}
+
+/* float overload of remquo: delegates to remquof, passing quo through unchanged. */
+__MATH_FUNCTIONS_DECL__ float remquo(const float a, const float b, int *quo)
+{
+  const float result = remquof(a, b, quo);
+  return result;
+}
+
+/* float overload of round: delegates to roundf. */
+__MATH_FUNCTIONS_DECL__ float round(const float a)
+{
+  const float result = roundf(a);
+  return result;
+}
+
+/* float overload of lround: delegates to lroundf. */
+__MATH_FUNCTIONS_DECL__ long int lround(const float a)
+{
+  const long int result = lroundf(a);
+  return result;
+}
+
+/* float overload of llround: delegates to llroundf. */
+__MATH_FUNCTIONS_DECL__ long long int llround(const float a)
+{
+  const long long int result = llroundf(a);
+  return result;
+}
+
+/* float overload of trunc: delegates to truncf. */
+__MATH_FUNCTIONS_DECL__ float trunc(const float a)
+{
+  const float result = truncf(a);
+  return result;
+}
+
+/* float overload of rint: delegates to rintf. */
+__MATH_FUNCTIONS_DECL__ float rint(const float a)
+{
+  const float result = rintf(a);
+  return result;
+}
+
+/* float overload of lrint: delegates to lrintf. */
+__MATH_FUNCTIONS_DECL__ long int lrint(const float a)
+{
+  const long int result = lrintf(a);
+  return result;
+}
+
+/* float overload of llrint: delegates to llrintf. */
+__MATH_FUNCTIONS_DECL__ long long int llrint(const float a)
+{
+  const long long int result = llrintf(a);
+  return result;
+}
+
+/* float overload of nearbyint: delegates to nearbyintf. */
+__MATH_FUNCTIONS_DECL__ float nearbyint(const float a)
+{
+  const float result = nearbyintf(a);
+  return result;
+}
+
+/* float overload of fdim: delegates to fdimf. */
+__MATH_FUNCTIONS_DECL__ float fdim(const float a, const float b)
+{
+  const float result = fdimf(a, b);
+  return result;
+}
+
+/* float overload of fma: delegates to fmaf. */
+__MATH_FUNCTIONS_DECL__ float fma(const float a, const float b, const float c)
+{
+  const float result = fmaf(a, b, c);
+  return result;
+}
+
+/* float overload of fmax: delegates to fmaxf. */
+__MATH_FUNCTIONS_DECL__ float fmax(const float a, const float b)
+{
+  const float result = fmaxf(a, b);
+  return result;
+}
+
+/* float overload of fmin: delegates to fminf. */
+__MATH_FUNCTIONS_DECL__ float fmin(const float a, const float b)
+{
+  const float result = fminf(a, b);
+  return result;
+}
+#endif /* !(((defined _GLIBCXX_MATH_H) && _GLIBCXX_MATH_H) && (__cplusplus >= 201103L)) */
+#endif /* !defined(__QNX__) && !(defined(_LIBCPP_VERSION) && _LIBCPP_VERSION >= 3800) */
+#if defined(__QNX__) && defined(_LIBCPP_VERSION)
+_LIBCPP_END_NAMESPACE_STD
+#endif /* __QNX__ && _LIBCPP_VERSION */
+#endif /* __CUDACC_RTC__ || (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* float overload of exp10: delegates to exp10f. */
+__MATH_FUNCTIONS_DECL__ float exp10(const float a)
+{
+  const float result = exp10f(a);
+  return result;
+}
+
+/* float overload of rsqrt: delegates to rsqrtf. */
+__MATH_FUNCTIONS_DECL__ float rsqrt(const float a)
+{
+  const float result = rsqrtf(a);
+  return result;
+}
+
+/* float overload of rcbrt: delegates to rcbrtf. */
+__MATH_FUNCTIONS_DECL__ float rcbrt(const float a)
+{
+  const float result = rcbrtf(a);
+  return result;
+}
+
+/* float overload of sinpi: delegates to sinpif. */
+__MATH_FUNCTIONS_DECL__ float sinpi(const float a)
+{
+  const float result = sinpif(a);
+  return result;
+}
+
+/* float overload of cospi: delegates to cospif. */
+__MATH_FUNCTIONS_DECL__ float cospi(const float a)
+{
+  const float result = cospif(a);
+  return result;
+}
+
+/* float overload of sincospi: delegates to sincospif, passing both output
+ * pointers through unchanged. */
+__MATH_FUNCTIONS_DECL__ void sincospi(const float a, float *const sptr, float *const cptr)
+{
+  sincospif(a, sptr, cptr);
+  return;
+}
+
+/* float overload of sincos: delegates to sincosf, passing both output
+ * pointers through unchanged. */
+__MATH_FUNCTIONS_DECL__ void sincos(const float a, float *const sptr, float *const cptr)
+{
+  sincosf(a, sptr, cptr);
+  return;
+}
+
+/* float overload of j0: delegates to j0f. */
+__MATH_FUNCTIONS_DECL__ float j0(const float a)
+{
+  const float result = j0f(a);
+  return result;
+}
+
+/* float overload of j1: delegates to j1f. */
+__MATH_FUNCTIONS_DECL__ float j1(const float a)
+{
+  const float result = j1f(a);
+  return result;
+}
+
+/* float overload of jn: delegates to jnf with the same order n. */
+__MATH_FUNCTIONS_DECL__ float jn(const int n, const float a)
+{
+  const float result = jnf(n, a);
+  return result;
+}
+
+/* float overload of y0: delegates to y0f. */
+__MATH_FUNCTIONS_DECL__ float y0(const float a)
+{
+  const float result = y0f(a);
+  return result;
+}
+
+/* float overload of y1: delegates to y1f. */
+__MATH_FUNCTIONS_DECL__ float y1(const float a)
+{
+  const float result = y1f(a);
+  return result;
+}
+
+/* float overload of yn: delegates to ynf with the same order n. */
+__MATH_FUNCTIONS_DECL__ float yn(const int n, const float a)
+{
+  const float result = ynf(n, a);
+  return result;
+}
+
+/* float overload of cyl_bessel_i0 (declared with the device-only prefix):
+ * delegates to cyl_bessel_i0f. */
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i0(const float a)
+{
+  const float result = cyl_bessel_i0f(a);
+  return result;
+}
+
+/* float overload of cyl_bessel_i1 (declared with the device-only prefix):
+ * delegates to cyl_bessel_i1f. */
+__MATH_FUNCTIONS_DEVICE_DECL__ float cyl_bessel_i1(const float a)
+{
+  const float result = cyl_bessel_i1f(a);
+  return result;
+}
+
+/* float overload of erfinv: delegates to erfinvf. */
+__MATH_FUNCTIONS_DECL__ float erfinv(const float a)
+{
+  const float result = erfinvf(a);
+  return result;
+}
+
+/* float overload of erfcinv: delegates to erfcinvf. */
+__MATH_FUNCTIONS_DECL__ float erfcinv(const float a)
+{
+  const float result = erfcinvf(a);
+  return result;
+}
+
+/* float overload of normcdfinv: delegates to normcdfinvf. */
+__MATH_FUNCTIONS_DECL__ float normcdfinv(const float a)
+{
+  const float result = normcdfinvf(a);
+  return result;
+}
+
+/* float overload of normcdf: delegates to normcdff. */
+__MATH_FUNCTIONS_DECL__ float normcdf(const float a)
+{
+  const float result = normcdff(a);
+  return result;
+}
+
+/* float overload of erfcx: delegates to erfcxf. */
+__MATH_FUNCTIONS_DECL__ float erfcx(const float a)
+{
+  const float result = erfcxf(a);
+  return result;
+}
+
+/* Mixed (double, float) copysign: widens b to double and calls copysign on
+ * two doubles. */
+__MATH_FUNCTIONS_DECL__ double copysign(const double a, const float b)
+{
+  const double sign_source = static_cast<double>(b);
+  return copysign(a, sign_source);
+}
+
+/* Mixed (float, double) copysign: widens a to double and calls copysign on
+ * two doubles. */
+__MATH_FUNCTIONS_DECL__ double copysign(const float a, const double b)
+{
+  const double magnitude_source = static_cast<double>(a);
+  return copysign(magnitude_source, b);
+}
+
+/* min for two unsigned int values: delegates to umin. */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const unsigned int b)
+{
+  const unsigned int result = umin(a, b);
+  return result;
+}
+
+/* min for (int, unsigned int): a is converted to unsigned int, then umin
+ * is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const int a, const unsigned int b)
+{
+  const unsigned int lhs = static_cast<unsigned int>(a);
+  return umin(lhs, b);
+}
+
+/* min for (unsigned int, int): b is converted to unsigned int, then umin
+ * is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned int min(const unsigned int a, const int b)
+{
+  const unsigned int rhs = static_cast<unsigned int>(b);
+  return umin(a, rhs);
+}
+
+/* min for two long int values. long int can be a 32-bit type on some systems,
+ * so the call is routed to min(int, int) when long int and int have the same
+ * size and to llmin otherwise. */
+__MATH_FUNCTIONS_DECL__ long int min(const long int a, const long int b)
+{
+  long int smaller;
+  /* The sizeof comparison is a constant expression; MSVC warning C4127
+   * (conditional expression is constant) is suppressed around it when
+   * __CUDA_ARCH__ is not defined. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(long int) == sizeof(int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const int lhs = static_cast<int>(a);
+    const int rhs = static_cast<int>(b);
+    smaller = static_cast<long int>(min(lhs, rhs));
+  } else {
+    const long long int lhs = static_cast<long long int>(a);
+    const long long int rhs = static_cast<long long int>(b);
+    smaller = static_cast<long int>(llmin(lhs, rhs));
+  }
+  return smaller;
+}
+
+/* min for two unsigned long int values: uses umin when unsigned long int and
+ * unsigned int have the same size, ullmin otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const unsigned long int b)
+{
+  unsigned long int smaller;
+  /* MSVC warning C4127 is suppressed around the constant sizeof comparison. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int lhs = static_cast<unsigned int>(a);
+    const unsigned int rhs = static_cast<unsigned int>(b);
+    smaller = static_cast<unsigned long int>(umin(lhs, rhs));
+  } else {
+    const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+    const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+    smaller = static_cast<unsigned long int>(ullmin(lhs, rhs));
+  }
+  return smaller;
+}
+
+/* min for (long int, unsigned long int). Both operands are converted to an
+ * unsigned type before comparing: unsigned int with umin when unsigned long
+ * int and unsigned int have the same size, unsigned long long int with ullmin
+ * otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const long int a, const unsigned long int b)
+{
+  unsigned long int smaller;
+  /* MSVC warning C4127 is suppressed around the constant sizeof comparison. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int lhs = static_cast<unsigned int>(a);
+    const unsigned int rhs = static_cast<unsigned int>(b);
+    smaller = static_cast<unsigned long int>(umin(lhs, rhs));
+  } else {
+    const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+    const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+    smaller = static_cast<unsigned long int>(ullmin(lhs, rhs));
+  }
+  return smaller;
+}
+
+/* min for (unsigned long int, long int). Both operands are converted to an
+ * unsigned type before comparing: unsigned int with umin when unsigned long
+ * int and unsigned int have the same size, unsigned long long int with ullmin
+ * otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int min(const unsigned long int a, const long int b)
+{
+  unsigned long int smaller;
+  /* MSVC warning C4127 is suppressed around the constant sizeof comparison. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int lhs = static_cast<unsigned int>(a);
+    const unsigned int rhs = static_cast<unsigned int>(b);
+    smaller = static_cast<unsigned long int>(umin(lhs, rhs));
+  } else {
+    const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+    const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+    smaller = static_cast<unsigned long int>(ullmin(lhs, rhs));
+  }
+  return smaller;
+}
+
+/* min for two long long int values: delegates to llmin. */
+__MATH_FUNCTIONS_DECL__ long long int min(const long long int a, const long long int b)
+{
+  const long long int result = llmin(a, b);
+  return result;
+}
+
+/* min for two unsigned long long int values: delegates to ullmin. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const unsigned long long int b)
+{
+  const unsigned long long int result = ullmin(a, b);
+  return result;
+}
+
+/* min for (long long int, unsigned long long int): a is converted to
+ * unsigned long long int, then ullmin is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const long long int a, const unsigned long long int b)
+{
+  const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+  return ullmin(lhs, b);
+}
+
+/* min for (unsigned long long int, long long int): b is converted to
+ * unsigned long long int, then ullmin is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int min(const unsigned long long int a, const long long int b)
+{
+  const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+  return ullmin(a, rhs);
+}
+
+/* min for two float values: delegates to fminf. */
+__MATH_FUNCTIONS_DECL__ float min(const float a, const float b)
+{
+  const float result = fminf(a, b);
+  return result;
+}
+
+/* min for two double values: delegates to fmin. */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const double b)
+{
+  const double result = fmin(a, b);
+  return result;
+}
+
+/* min for (float, double): a is widened to double, then fmin is applied. */
+__MATH_FUNCTIONS_DECL__ double min(const float a, const double b)
+{
+  const double lhs = static_cast<double>(a);
+  return fmin(lhs, b);
+}
+
+/* min for (double, float): b is widened to double, then fmin is applied. */
+__MATH_FUNCTIONS_DECL__ double min(const double a, const float b)
+{
+  const double rhs = static_cast<double>(b);
+  return fmin(a, rhs);
+}
+
+/* max for two unsigned int values: delegates to umax. */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const unsigned int b)
+{
+  const unsigned int result = umax(a, b);
+  return result;
+}
+
+/* max for (int, unsigned int): a is converted to unsigned int, then umax
+ * is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const int a, const unsigned int b)
+{
+  const unsigned int lhs = static_cast<unsigned int>(a);
+  return umax(lhs, b);
+}
+
+/* max for (unsigned int, int): b is converted to unsigned int, then umax
+ * is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned int max(const unsigned int a, const int b)
+{
+  const unsigned int rhs = static_cast<unsigned int>(b);
+  return umax(a, rhs);
+}
+
+/* max for two long int values. long int can be a 32-bit type on some systems,
+ * so the call is routed to max(int, int) when long int and int have the same
+ * size and to llmax otherwise. */
+__MATH_FUNCTIONS_DECL__ long int max(const long int a, const long int b)
+{
+  long int larger;
+  /* MSVC warning C4127 is suppressed around the constant sizeof comparison. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(long int) == sizeof(int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const int lhs = static_cast<int>(a);
+    const int rhs = static_cast<int>(b);
+    larger = static_cast<long int>(max(lhs, rhs));
+  } else {
+    const long long int lhs = static_cast<long long int>(a);
+    const long long int rhs = static_cast<long long int>(b);
+    larger = static_cast<long int>(llmax(lhs, rhs));
+  }
+  return larger;
+}
+
+/* max for two unsigned long int values: uses umax when unsigned long int and
+ * unsigned int have the same size, ullmax otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const unsigned long int b)
+{
+  unsigned long int larger;
+  /* MSVC warning C4127 is suppressed around the constant sizeof comparison. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int lhs = static_cast<unsigned int>(a);
+    const unsigned int rhs = static_cast<unsigned int>(b);
+    larger = static_cast<unsigned long int>(umax(lhs, rhs));
+  } else {
+    const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+    const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+    larger = static_cast<unsigned long int>(ullmax(lhs, rhs));
+  }
+  return larger;
+}
+
+/* max for (long int, unsigned long int). Both operands are converted to an
+ * unsigned type before comparing: unsigned int with umax when unsigned long
+ * int and unsigned int have the same size, unsigned long long int with ullmax
+ * otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const long int a, const unsigned long int b)
+{
+  unsigned long int larger;
+  /* MSVC warning C4127 is suppressed around the constant sizeof comparison. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int lhs = static_cast<unsigned int>(a);
+    const unsigned int rhs = static_cast<unsigned int>(b);
+    larger = static_cast<unsigned long int>(umax(lhs, rhs));
+  } else {
+    const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+    const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+    larger = static_cast<unsigned long int>(ullmax(lhs, rhs));
+  }
+  return larger;
+}
+
+/* max for (unsigned long int, long int). Both operands are converted to an
+ * unsigned type before comparing: unsigned int with umax when unsigned long
+ * int and unsigned int have the same size, unsigned long long int with ullmax
+ * otherwise. */
+__MATH_FUNCTIONS_DECL__ unsigned long int max(const unsigned long int a, const long int b)
+{
+  unsigned long int larger;
+  /* MSVC warning C4127 is suppressed around the constant sizeof comparison. */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+  if (sizeof(unsigned long int) == sizeof(unsigned int)) {
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+    const unsigned int lhs = static_cast<unsigned int>(a);
+    const unsigned int rhs = static_cast<unsigned int>(b);
+    larger = static_cast<unsigned long int>(umax(lhs, rhs));
+  } else {
+    const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+    const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+    larger = static_cast<unsigned long int>(ullmax(lhs, rhs));
+  }
+  return larger;
+}
+
+/* max for two long long int values: delegates to llmax. */
+__MATH_FUNCTIONS_DECL__ long long int max(const long long int a, const long long int b)
+{
+  const long long int result = llmax(a, b);
+  return result;
+}
+
+/* max for two unsigned long long int values: delegates to ullmax. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const unsigned long long int b)
+{
+  const unsigned long long int result = ullmax(a, b);
+  return result;
+}
+
+/* max for (long long int, unsigned long long int): a is converted to
+ * unsigned long long int, then ullmax is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const long long int a, const unsigned long long int b)
+{
+  const unsigned long long int lhs = static_cast<unsigned long long int>(a);
+  return ullmax(lhs, b);
+}
+
+/* max for (unsigned long long int, long long int): b is converted to
+ * unsigned long long int, then ullmax is applied. */
+__MATH_FUNCTIONS_DECL__ unsigned long long int max(const unsigned long long int a, const long long int b)
+{
+  const unsigned long long int rhs = static_cast<unsigned long long int>(b);
+  return ullmax(a, rhs);
+}
+
+/* max for two float values: delegates to fmaxf. */
+__MATH_FUNCTIONS_DECL__ float max(const float a, const float b)
+{
+  const float result = fmaxf(a, b);
+  return result;
+}
+
+/* max for two double values: delegates to fmax. */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const double b)
+{
+  const double result = fmax(a, b);
+  return result;
+}
+
+/* max for (float, double): a is widened to double, then fmax is applied. */
+__MATH_FUNCTIONS_DECL__ double max(const float a, const double b)
+{
+  const double lhs = static_cast<double>(a);
+  return fmax(lhs, b);
+}
+
+/* max for (double, float): b is widened to double, then fmax is applied. */
+__MATH_FUNCTIONS_DECL__ double max(const double a, const float b)
+{
+  const double rhs = static_cast<double>(b);
+  return fmax(a, rhs);
+}
+
+
+#if !defined(__CUDA_ARCH__)
+/* Linkage prefix for the helper definitions that follow. On _WIN32 the
+ * helpers are additionally `static`, and warning 4211 is disabled after
+ * saving the current warning state with a push. */
+#if !defined(_WIN32)
+#define __HELPER_FUNC_LINKAGE inline __host__ __device__
+#else  /* defined(_WIN32) */
+#define __HELPER_FUNC_LINKAGE static inline __host__ __device__
+#pragma warning (push)
+#pragma warning (disable : 4211)
+#endif  /* !defined(_WIN32) */
+
+/* min for int: yields a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE int min(const int a, const int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* umin for unsigned int: yields a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned int umin(const unsigned int a, const unsigned int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* llmin for long long int: yields a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE long long int llmin(const long long int a, const long long int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* ullmin for unsigned long long int: yields a when a < b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned long long int ullmin(const unsigned long long int a,
+                                                    const unsigned long long int b)
+{
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* max for int: yields a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE int max(const int a, const int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* umax for unsigned int: yields a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned int umax(const unsigned int a, const unsigned int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* llmax for long long int: yields a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE long long int llmax(const long long int a, const long long int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/* ullmax for unsigned long long int: yields a when a > b, otherwise b. */
+__HELPER_FUNC_LINKAGE unsigned long long int ullmax(const unsigned long long int a,
+                                                    const unsigned long long int b)
+{
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* defined(_WIN32) */
+
+#undef __HELPER_FUNC_LINKAGE
+
+#endif /* !defined(__CUDA_ARCH__) */
+
+#undef __MATH_FUNCTIONS_DECL__
+#undef __MATH_FUNCTIONS_DEVICE_DECL__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+#if !defined(__CUDACC__)
+
+#include "host_defines.h"
+#include "math_constants.h"
+
+#define __cuda_INT_MAX \
+        ((int)((unsigned int)-1 >> 1))
+
+/*******************************************************************************
+*                                                                              *
+* ONLY FOR HOST CODE! NOT FOR DEVICE EXECUTION                                 *
+*                                                                              *
+*******************************************************************************/
+
+#include <crt/func_macro.h>
+
+#if defined(_WIN32)
+#pragma warning (push)
+#pragma warning (disable : 4211)
+
+#endif /* _WIN32 */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined (__ANDROID__) || defined(__QNX__)
+
+/* Return 1 when a is a NaN, 0 otherwise, by inspecting its 64-bit pattern. */
+__func__(int __isnan(const double a))
+{
+  unsigned long long int bits;
+  unsigned long long int unsigned_part;
+
+  memcpy(&bits, &a, sizeof(double));
+  /* Shift the sign bit out; any pattern above "all-ones exponent, zero
+     mantissa" (i.e. above infinity) is a NaN. */
+  unsigned_part = bits << 1ULL;
+  return (unsigned_part > 0xffe0000000000000ULL) ? 1 : 0;
+}
+
+#endif /* _WIN32 || __APPLE__ || __ANDROID__ || __QNX__ */
+
+#if defined(_WIN32) || defined(__APPLE__) || defined(__QNX__)
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE ROUTINES FOR WINDOWS & APPLE PLATFORMS        *
+*                                                                              *
+*******************************************************************************/
+
+/* Base-10 exponential, implemented as pow(10.0, a). */
+__func__(double exp10(const double a))
+{
+  const double base = 10.0;
+  return pow(base, a);
+}
+
+/* Single-precision base-10 exponential: evaluate exp10() in double, then narrow. */
+__func__(float exp10f(const float a))
+{
+    const double widened = static_cast<double>(a);
+    const double result = exp10(widened);
+    return static_cast<float>(result);
+}
+
+/* Store sin(a) through sptr and then cos(a) through cptr. */
+__func__(void sincos(const double a, double *sptr, double *cptr))
+{
+  const double sine = sin(a);
+  const double cosine = cos(a);
+
+  /* Keep the sine store first so aliased pointers end up holding the cosine. */
+  *sptr = sine;
+  *cptr = cosine;
+}
+
+/* Single-precision sincos: evaluate in double via sincos(), then narrow both results. */
+__func__(void sincosf(const float a, float *sptr, float *cptr))
+{
+  double sine;
+  double cosine;
+  const double widened = static_cast<double>(a);
+
+  sincos(widened, &sine, &cosine);
+  *sptr = static_cast<float>(sine);
+  *cptr = static_cast<float>(cosine);
+}
+
+/* Return 1 when a is +infinity or -infinity, 0 otherwise. */
+__func__(int __isinf(const double a))
+{
+  unsigned long long int bits;
+  unsigned long long int unsigned_part;
+
+  memcpy(&bits, &a, sizeof(double));
+  /* With the sign bit shifted out, infinity is exactly "all-ones exponent,
+     zero mantissa". */
+  unsigned_part = bits << 1ULL;
+  return (unsigned_part == 0xffe0000000000000ULL) ? 1 : 0;
+}
+
+#endif /* _WIN32 || __APPLE__ || __QNX__ */
+
+#if defined(_WIN32) || defined (__ANDROID__)
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Base-2 logarithm computed as log(a) scaled by the constant 1.44269504088896340. */
+__func__(double log2(const double a))
+{
+  const double natural = log(a);
+  return natural * 1.44269504088896340;
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* _WIN32 || __ANDROID__ */
+
+#if defined(_WIN32)
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE ROUTINES FOR WINDOWS PLATFORM                 *
+*                                                                              *
+*******************************************************************************/
+
+/* Return 1 when the sign bit of a is set (including -0.0 and negative NaNs), else 0. */
+__func__(int __signbit(double a))
+{
+  unsigned long long int bits;
+
+  memcpy(&bits, &a, sizeof(double));
+  /* Bit 63 is the sign bit; isolate it as 0 or 1. */
+  return static_cast<int>(bits >> 63ULL);
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Return a value with the magnitude bits of a and the sign bit of b. */
+__func__(double copysign(double a, double b))
+{
+  const unsigned long long int sign_mask = 0x8000000000000000ULL;
+  unsigned long long int magnitude_bits;
+  unsigned long long int sign_bits;
+  double combined;
+
+  memcpy(&magnitude_bits, &a, sizeof(double));
+  memcpy(&sign_bits, &b, sizeof(double));
+  magnitude_bits &= ~sign_mask;   /* drop a's own sign */
+  magnitude_bits |= (sign_bits & sign_mask);
+  memcpy(&combined, &magnitude_bits, sizeof(double));
+  return combined;
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/* Return 1 when a is neither infinite nor NaN, 0 otherwise. */
+__func__(int __finite(double a))
+{
+  unsigned long long int bits;
+  unsigned long long int unsigned_part;
+
+  memcpy(&bits, &a, sizeof(double));
+  /* With the sign bit shifted out, every finite value sorts strictly below
+     the infinity pattern. */
+  unsigned_part = bits << 1ULL;
+  return (unsigned_part < 0xffe0000000000000ULL) ? 1 : 0;
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/*
+ * Larger of a and b. A single NaN operand is ignored in favour of the other
+ * operand; two NaNs yield a + b. For two zeros, a is returned when b carries
+ * a sign bit.
+ */
+__func__(double fmax(double a, double b))
+{
+  if (__isnan(a)) {
+    if (__isnan(b)) {
+      return a + b;
+    }
+    return b;
+  }
+  if (__isnan(b)) {
+    return a;
+  }
+  if ((a == 0.0) && (b == 0.0) && __signbit(b)) {
+    return a;
+  }
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+/*
+ * Smaller of a and b. A single NaN operand is ignored in favour of the other
+ * operand; two NaNs yield a + b. For two zeros, a is returned when a carries
+ * a sign bit.
+ */
+__func__(double fmin(double a, double b))
+{
+  if (__isnan(a)) {
+    if (__isnan(b)) {
+      return a + b;
+    }
+    return b;
+  }
+  if (__isnan(b)) {
+    return a;
+  }
+  if ((a == 0.0) && (b == 0.0) && __signbit(a)) {
+    return a;
+  }
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/* Round toward zero: ceil() for negative inputs, floor() otherwise. */
+__func__(double trunc(double a))
+{
+  if (a < 0.0) {
+    return ceil(a);
+  }
+  return floor(a);
+}
+
+/*
+ * Round to the nearest integer, halfway cases away from zero. Inputs whose
+ * magnitude exceeds CUDART_TWO_TO_52 are returned unchanged.
+ */
+__func__(double round(double a))
+{
+  const double magnitude = fabs(a);
+  double nearest;
+
+  if (magnitude > CUDART_TWO_TO_52) {
+    return a;
+  }
+  nearest = floor(magnitude + 0.5);
+  /* magnitude + 0.5 can round up to 1.0 for inputs just below 0.5. */
+  if (magnitude < 0.5) {
+    nearest = 0;
+  }
+  return copysign (nearest, a);
+}
+
+/* round() followed by conversion to long int. */
+__func__(long int lround(double a))
+{
+  const double nearest = round(a);
+  return static_cast<long int>(nearest);
+}
+
+/* round() followed by conversion to long long int. */
+__func__(long long int llround(double a))
+{
+  const double nearest = round(a);
+  return static_cast<long long int>(nearest);
+}
+
+/*
+ * Round to an integral value by adding and then subtracting CUDART_TWO_TO_52
+ * on the magnitude, so the rounding performed by the addition decides the
+ * result. Magnitudes of at least CUDART_TWO_TO_52 are returned unchanged.
+ */
+__func__(double rint(double a))
+{
+  const double magnitude = fabs(a);
+  double shifted = CUDART_TWO_TO_52 + magnitude;
+
+  if (magnitude >= CUDART_TWO_TO_52) {
+    return a;
+  }
+  shifted = shifted - CUDART_TWO_TO_52;
+  return copysign (shifted, a);
+}
+
+/* Delegates directly to rint(). */
+__func__(double nearbyint(double a))
+{
+  const double rounded = rint(a);
+  return rounded;
+}
+
+/* rint() followed by conversion to long int. */
+__func__(long int lrint(double a))
+{
+  const double rounded = rint(a);
+  return static_cast<long int>(rounded);
+}
+
+/* rint() followed by conversion to long long int. */
+__func__(long long int llrint(double a))
+{
+  const double rounded = rint(a);
+  return static_cast<long long int>(rounded);
+}
+
+/*
+ * Positive difference: a - b when a > b, 0.0 when a <= b. When the operands
+ * are unordered (neither comparison holds) a is returned if it is NaN,
+ * otherwise b.
+ */
+__func__(double fdim(double a, double b))
+{
+  if (a > b) {
+    return (a - b);
+  }
+  if (a <= b) {
+    return 0.0;
+  }
+  /* Neither comparison held, so at least one operand is NaN. */
+  return __isnan(a) ? a : b;
+}
+
+/* Scale a by 2 raised to b; forwards to ldexp(). */
+__func__(double scalbn(double a, int b))
+{
+  const double scaled = ldexp(a, b);
+  return scaled;
+}
+
+/*
+ * Scale a by 2 raised to b. The long exponent is first clamped to the range
+ * [-2147483648, 2147483647] and then handed to scalbn().
+ */
+__func__(double scalbln(double a, long int b))
+{
+  const int upper = 2147483647;
+  const int lower = (-2147483647 - 1);
+  int clamped = static_cast<int>(0);
+
+  if (b > 2147483647L) {
+    clamped = upper;
+  } else if (b < lower) {
+    clamped = lower;
+  } else {
+    clamped = static_cast<int>(b);
+  }
+  return scalbn(a, clamped);
+}
+
+/* Base-2 exponential, implemented as pow(2.0, a). */
+__func__(double exp2(double a))
+{
+  const double base = 2.0;
+  return pow(base, a);
+}
+
+/*  
+ * The following is based on: David Goldberg, "What every computer scientist 
+ * should know about floating-point arithmetic", ACM Computing Surveys, Volume 
+ * 23, Issue 1, March 1991.
+ */
+/*
+ * log(1 + a). When 1 + a rounds to exactly 1 the input itself is returned;
+ * for a < 1 the logarithm is rescaled by a / ((1 + a) - 1) to compensate for
+ * the rounding of 1 + a. The temporaries are volatile, as in the reference
+ * formulation, so each intermediate is stored before it is reused.
+ */
+__func__(double log1p(double a))
+{
+  volatile double result, excess;
+
+  result = 1.0 + a;
+  if (result != 1.0) {
+    excess = result - 1.0;
+    result = log(result);
+    if (a < 1.0) {
+      /* a somewhat close to zero: apply the correction factor */
+      result = a * result;
+      result = result / excess;
+    }
+  } else {
+    /* a very close to zero */
+    result = a;
+  }
+  return result;
+}
+
+/*
+ * This code based on: http://www.cs.berkeley.edu/~wkahan/Math128/Sumnfp.pdf
+ */
+/*
+ * exp(a) - 1. When the direct difference is exactly zero the input itself is
+ * returned; for |a| < 1 the difference is rescaled by a / log(exp(a)). The
+ * temporaries are volatile so each intermediate is stored before reuse.
+ */
+__func__(double expm1(double a))
+{
+  volatile double grown, diff;
+
+  grown = exp(a);
+  diff = grown - 1.0;
+  if (diff != 0.0) {
+    if (fabs(a) < 1.0) {
+      /* a somewhat close to zero: apply the correction factor */
+      grown = log(grown);
+      diff = diff * a;
+      diff = diff / grown;
+    }
+  } else {
+    /* a very close to zero */
+    diff = a;
+  }
+  return diff;
+}
+
+/*
+ * Cube root. Zero and infinities are returned unchanged; otherwise an initial
+ * estimate exp2(log2(|a|) / 3) is refined with one correction step and the
+ * sign of a is re-applied.
+ */
+__func__(double cbrt(double a))
+{
+  double magnitude, estimate;
+
+  if (a == 0.0 || __isinf(a)) {
+    return a;
+  }
+  magnitude = fabs(a);
+  /* initial approximation */
+  estimate = exp2(CUDART_THIRD * log2(magnitude));
+  /* refine approximation */
+  estimate = estimate - (estimate - (magnitude / (estimate * estimate))) * CUDART_THIRD;
+  return copysign(estimate, a);
+}
+
+/*
+ * Inverse hyperbolic cosine. When a - 1 is indistinguishable from a the
+ * result is log(2) + log(a); otherwise it is
+ * log1p((a - 1) + sqrt((a + 1) * (a - 1))).
+ */
+__func__(double acosh(double a))
+{
+  double above_one, below;
+
+  below = a - 1.0;
+  if (below == a) {
+    return log(2.0) + log(a);
+  }
+  above_one = a + 1.0;
+  below = below + sqrt(above_one * below);
+  return log1p(below);
+}
+
+/*
+ * Inverse hyperbolic sine, evaluated on |a| and then given the sign of a.
+ * For |a| > 1e18 the result is log(2) + log(|a|); otherwise a log1p-based
+ * expression using the reciprocal of |a| is used.
+ */
+__func__(double asinh(double a))
+{
+  const double magnitude = fabs(a);
+  double value;
+
+  if (magnitude > 1e18) {
+    value = log(2.0) + log(magnitude);
+  } else {
+    const double reciprocal = 1.0 / magnitude;
+    value = magnitude + magnitude / (reciprocal + sqrt(1.0 + reciprocal * reciprocal));
+    value = log1p(value);
+  }
+  return copysign(value, a);
+}
+
+/*
+ * Inverse hyperbolic tangent: 0.5 * log1p(2|a| / (1 - |a|)), negated when a
+ * has its sign bit set. A NaN input returns a + a, and a NaN intermediate is
+ * returned without negation.
+ */
+__func__(double atanh(double a))
+{
+  double magnitude, value;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  magnitude = fabs(a);
+  value = (2.0 * magnitude) / (1.0 - magnitude);
+  value = 0.5 * log1p(value);
+  if (!__isnan(value) && __signbit(a)) {
+    return -value;
+  }
+  return value;
+}
+
+/*
+ * Unbiased binary exponent of a as an int. NaN and zero return
+ * -__cuda_INT_MAX-1, infinities return __cuda_INT_MAX. Subnormal inputs are
+ * normalized by shifting so their effective exponent is reported.
+ */
+__func__(int ilogb(double a))
+{
+  const unsigned long long int smallest_normal = 0x0010000000000000ULL;
+  unsigned long long int magnitude;
+  int exponent;
+
+  if (__isnan(a)) return -__cuda_INT_MAX-1;
+  if (__isinf(a)) return __cuda_INT_MAX;
+  memcpy(&magnitude, &a, sizeof(double));
+  magnitude = magnitude & 0x7fffffffffffffffULL;   /* strip the sign bit */
+  if (magnitude == 0) return -__cuda_INT_MAX-1;
+  if (magnitude >= smallest_normal) {
+    return (int)(((magnitude >> 52ULL) & 0x7ffU) - 1023);
+  }
+  /* Subnormal (non-zero, below the smallest normal): at least one shift is
+     always required to reach the normal range. */
+  exponent = -1022;
+  do {
+    exponent--;
+    magnitude <<= 1;
+  } while (magnitude < smallest_normal);
+  return exponent;
+}
+
+/*
+ * Unbiased binary exponent of a as a double. NaN returns a + a, infinities
+ * return +infinity (fabs(a)), zero returns -1.0/fabs(a). Subnormal inputs are
+ * normalized by shifting so their effective exponent is reported.
+ */
+__func__(double logb(double a))
+{
+  const unsigned long long int smallest_normal = 0x0010000000000000ULL;
+  unsigned long long int magnitude;
+  int exponent;
+
+  if (__isnan(a)) return a + a;
+  if (__isinf(a)) return fabs(a);
+  memcpy(&magnitude, &a, sizeof(double));
+  magnitude = magnitude & 0x7fffffffffffffffULL;   /* strip the sign bit */
+  if (magnitude == 0) return -1.0/fabs(a);
+  if (magnitude >= smallest_normal) {
+    return (double)((int)((magnitude >> 52ULL) & 0x7ffU) - 1023);
+  }
+  /* Subnormal (non-zero, below the smallest normal): at least one shift is
+     always required to reach the normal range. */
+  exponent = -1022;
+  do {
+    exponent--;
+    magnitude <<= 1;
+  } while (magnitude < smallest_normal);
+  return static_cast<double>(exponent);
+}
+
+__func__(double remquo(double a, double b, int *quo))
+{ /* Host fallback for remquo(): returns the remainder of a / b with the
+   * quotient rounded to nearest (ties to even), and stores in *quo the low
+   * three bits of the quotient magnitude, negated when the signs of a and b
+   * differ. The computation works on the raw bit patterns using a restoring
+   * shift-and-subtract division.
+   * Special cases (all store 0 in *quo): a NaN operand returns a + b; an
+   * infinite a or a zero b returns the NaN with pattern 0xfff8000000000000;
+   * a zero a or an infinite b returns a.
+   */
+  unsigned long long int aa, bb;
+  int rem1 = 1; /* do FPREM1, a.k.a IEEE remainder */
+  int expo_a;
+  int expo_b;
+  unsigned long long mant_a;
+  unsigned long long mant_b;
+  unsigned long long mant_c;
+  unsigned long long temp;
+  int sign_a;
+  int sign_b;
+  int sign_c;
+  int expo_c;
+  int expodiff;
+  int quot = 0;                 /* initialize quotient */
+  int l;
+  int iter;
+
+  memcpy(&aa, &a, sizeof(double));
+  mant_a = (aa << 11ULL) | 0x8000000000000000ULL; /* fraction left-aligned, leading 1 forced into bit 63 */
+  expo_a = (int)((aa >> 52ULL) & 0x7ffU) - 1023; /* unbiased exponent */
+  sign_a = (int)(aa >> 63ULL);
+
+  memcpy(&bb, &b, sizeof(double));
+  mant_b = (bb << 11ULL) | 0x8000000000000000ULL; /* same decomposition for the divisor */
+  expo_b = (int)((bb >> 52ULL) & 0x7ffU) - 1023;
+  sign_b = (int)(bb >> 63ULL);
+
+  sign_c = sign_a;  /* remainder has sign of dividend */
+  expo_c = expo_a;  /* default */
+      
+  /* handle NaNs and infinities */
+  if (__isnan(a) || __isnan(b)) {
+    *quo = quot;
+    return a + b;
+  }
+  if (__isinf(a) || (b == 0.0)) {
+    *quo = quot;
+    aa = 0xfff8000000000000ULL; /* NaN bit pattern returned for the invalid cases */
+    memcpy(&a, &aa, sizeof(double));
+    return a;
+  }
+  if ((a == 0.0) || (__isinf(b))) {
+    *quo = quot;
+    return a;
+  }
+  /* normalize denormals: a biased exponent of 0 gives expo < -1022. The first
+     doubling shifts out the leading 1 that was OR-ed in unconditionally
+     above; the loop then shifts until bit 63 is set again, lowering the
+     exponent once per shift. */
+  if (expo_a < -1022) {
+    mant_a = mant_a + mant_a;
+    while (mant_a < 0x8000000000000000ULL) {
+      mant_a = mant_a + mant_a;
+      expo_a--;
+    }
+  } 
+  if (expo_b < -1022) {
+    mant_b = mant_b + mant_b;
+    while (mant_b < 0x8000000000000000ULL) {
+      mant_b = mant_b + mant_b;
+      expo_b--;
+    }
+  }
+  expodiff = expo_a - expo_b;
+  /* clamp iterations if exponent difference negative; iter = -1 makes the
+     division loop below run zero times */
+  if (expodiff < 0) {
+    iter = -1;
+  } else {
+    iter = expodiff;
+  }
+  /* Shift dividend and divisor right by one bit to prevent overflow
+     during the division algorithm.
+   */
+  mant_a = mant_a >> 1ULL;
+  mant_b = mant_b >> 1ULL;
+  expo_c = expo_a - iter; /* default exponent of result   */
+
+  /* Use binary longhand division (restoring): one quotient bit per pass,
+     iter + 1 passes in total */
+  for (l = 0; l < (iter + 1); l++) {
+    mant_a = mant_a - mant_b;
+    if (mant_a & 0x8000000000000000ULL) { /* went negative: restore, quotient bit 0 */
+      mant_a = mant_a + mant_b;
+      quot = quot + quot;
+    } else { /* subtraction fit: quotient bit 1 */
+      quot = quot + quot + 1;
+    }
+    mant_a = mant_a + mant_a;
+  }
+
+  /* Save current remainder */
+  mant_c = mant_a;
+  /* If remainder's mantissa is all zeroes, final result is zero (carrying
+     the sign of the dividend). */
+  if (mant_c == 0) {
+    quot = quot & 7;
+    *quo = (sign_a ^ sign_b) ? -quot : quot;
+    aa = static_cast<unsigned long long int>(sign_c) << 63ULL;
+    memcpy(&a, &aa, sizeof(double));
+    return a;
+  }
+  /* Normalize result */
+  while (!(mant_c & 0x8000000000000000ULL)) {
+    mant_c = mant_c + mant_c;
+    expo_c--;
+  }
+  /* For IEEE remainder (quotient rounded to nearest-even) we might need to
+     do a final subtraction of the divisor from the remainder.
+  */
+  if (rem1 && ((expodiff+1) >= 0)) {
+    temp = mant_a - mant_b;
+    /* round quotient to nearest even */
+    if (((temp != 0ULL) && (!(temp & 0x8000000000000000ULL))) ||
+        ((temp == 0ULL) && (quot & 1))) {
+      mant_a = mant_a >> 1ULL;
+      quot++;
+      /* Since the divisor is greater than the remainder, the result will
+         have opposite sign of the dividend. To avoid a negative mantissa
+         when subtracting the divisor from remainder, reverse subtraction
+      */
+      sign_c = 1 ^ sign_c;
+      expo_c = expo_a - iter + 1;
+      mant_c = mant_b - mant_a;
+      /* normalize result */
+      while (!(mant_c & 0x8000000000000000ULL)) {
+        mant_c = mant_c + mant_c;
+        expo_c--;
+      }
+    }
+  }
+  /* package up result: sign in bit 63, exponent field from bit 52, mantissa
+     shifted back down by 11 bits (further for a denormal result) */
+  if (expo_c >= -1022) { /* normal */
+    mant_c = ((mant_c >> 11ULL) +
+              (((static_cast<unsigned long long>(sign_c)) << 63ULL) +
+               (((unsigned long long)(expo_c + 1022)) << 52ULL)));
+  } else { /* denormal */
+    mant_c = (((static_cast<unsigned long long>(sign_c)) << 63ULL) +
+              (mant_c >> (unsigned long long)(11 - expo_c - 1022)));
+  }
+  quot = quot & 7; /* mask quotient down to least significant three bits */
+  *quo = (sign_a ^ sign_b) ? -quot : quot;
+  memcpy(&a, &mant_c, sizeof(double));
+  return a;
+}
+
+/* Remainder of a / b as computed by remquo(); the quotient bits are discarded. */
+__func__(double remainder(double a, double b))
+{
+  int unused_quotient;
+  const double rem = remquo (a, b, &unused_quotient);
+  return rem;
+}
+
+__func__(double fma (double a, double b, double c))
+{ /* Host fallback for fma(): computes a * b + c using only 32-bit integer
+   * operations on the operands' bit patterns, rounding once at the end
+   * (the final increment logic rounds to nearest, ties to even).
+   * Each double is viewed as two 32-bit words; .hi is used as the word
+   * holding sign, exponent and upper mantissa bits, so the code relies on
+   * the low word being stored first in memory.
+   * Outline: (1) special operands (NaN, infinity, zero, subnormal) are
+   * resolved or normalized up front; (2) the mantissa product is formed as
+   * four 32-bit words in xx.hi:xx.lo:yy.hi:yy.lo; (3) a non-zero addend is
+   * aligned and added or subtracted; (4) the result is rounded and packed
+   * as a normal number, an infinity on overflow, or a subnormal/zero.
+   */
+  struct {
+    unsigned int lo;
+    unsigned int hi;
+  } xx, yy, zz, ww;
+  double d;
+  unsigned int s, t, u, prod0, prod1, prod2, prod3, expo_x, expo_y, expo_z;
+
+  memcpy(&xx, &a, sizeof(double));
+  memcpy(&yy, &b, sizeof(double));
+  memcpy(&zz, &c, sizeof(double));
+
+  expo_z = 0x7FFU; /* used as the exponent-field mask until overwritten below */
+  t =  xx.hi >> 20;
+  expo_x = expo_z & t;
+  expo_x = expo_x - 1;    /* expo(x) - 1 */
+  t =  yy.hi >> 20;
+  expo_y = expo_z & t;
+  expo_y = expo_y - 1;    /* expo(y) - 1 */
+  t =  zz.hi >> 20;
+  expo_z = expo_z & t;
+  expo_z = expo_z - 1;    /* expo(z) - 1 */
+
+  /* The unsigned "- 1" maps exponent field 0 to 0xffffffff and field 0x7ff
+     to 0x7fe, so this single range test sends every zero, subnormal,
+     infinity and NaN operand into the special-case block. */
+  if (!((expo_x <= 0x7FDU) &&
+        (expo_y <= 0x7FDU) &&
+        (expo_z <= 0x7FDU))) {
+    
+    /* fma (nan, y, z) --> nan
+       fma (x, nan, z) --> nan
+       fma (x, y, nan) --> nan 
+    */
+    if (((yy.hi << 1) | (yy.lo != 0)) > 0xffe00000U) {
+      yy.hi |= 0x00080000U; /* set the top mantissa bit of the returned NaN */
+      memcpy(&d, &yy, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) > 0xffe00000U) {
+      zz.hi |= 0x00080000U;
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    if (((xx.hi << 1) | (xx.lo != 0)) > 0xffe00000U) {
+      xx.hi |= 0x00080000U;
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    
+    /* fma (0, inf, z) --> INDEFINITE
+       fma (inf, 0, z) --> INDEFINITE
+       fma (-inf,+y,+inf) --> INDEFINITE
+       fma (+x,-inf,+inf) --> INDEFINITE
+       fma (+inf,-y,+inf) --> INDEFINITE
+       fma (-x,+inf,+inf) --> INDEFINITE
+       fma (-inf,-y,-inf) --> INDEFINITE
+       fma (-x,-inf,-inf) --> INDEFINITE
+       fma (+inf,+y,-inf) --> INDEFINITE
+       fma (+x,+inf,-inf) --> INDEFINITE
+       (INDEFINITE is returned as the NaN with words 0xfff80000:00000000)
+    */
+    if (((((xx.hi << 1) | xx.lo) == 0) && 
+         (((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U)) ||
+        ((((yy.hi << 1) | yy.lo) == 0) && 
+         (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U))) {
+      xx.hi = 0xfff80000U;
+      xx.lo = 0x00000000U;
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) == 0xffe00000U) {
+      if ((((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U) ||
+          (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U)) {
+        if ((int)(xx.hi ^ yy.hi ^ zz.hi) < 0) { /* product sign differs from addend sign */
+          xx.hi = 0xfff80000U;
+          xx.lo = 0x00000000U;
+          memcpy(&d, &xx, sizeof(double));
+          return d;
+        }
+      }
+    }
+    /* fma (inf, y, z) --> inf
+       fma (x, inf, z) --> inf
+       fma (x, y, inf) --> inf
+    */
+    if (((xx.hi << 1) | (xx.lo != 0)) == 0xffe00000U) {
+      xx.hi = xx.hi ^ (yy.hi & 0x80000000U); /* apply the sign of the other factor */
+      memcpy(&d, &xx, sizeof(double));
+      return d;
+    }
+    if (((yy.hi << 1) | (yy.lo != 0)) == 0xffe00000U) {
+      yy.hi = yy.hi ^ (xx.hi & 0x80000000U);
+      memcpy(&d, &yy, sizeof(double));
+      return d;
+    }
+    if (((zz.hi << 1) | (zz.lo != 0)) == 0xffe00000U) {
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    /* fma (+0, -y, -0) --> -0
+       fma (-0, +y, -0) --> -0
+       fma (+x, -0, -0) --> -0
+       fma (-x, +0, -0) --> -0
+    */
+    if ((zz.hi == 0x80000000U) && (zz.lo == 0)) {
+      if ((((xx.hi << 1) | xx.lo) == 0) ||
+          (((yy.hi << 1) | yy.lo) == 0)) {
+        if ((int)(xx.hi ^ yy.hi) < 0) {
+          memcpy(&d, &zz, sizeof(double));
+          return d;
+        }
+      }
+    }
+    /* fma (0, y, 0) --> +0  (-0 if round down and signs of addend differ)
+       fma (x, 0, 0) --> +0  (-0 if round down and signs of addend differ)
+       This implementation always returns +0 here.
+    */
+    if ((((zz.hi << 1) | zz.lo) == 0) &&
+        ((((xx.hi << 1) | xx.lo) == 0) ||
+         (((yy.hi << 1) | yy.lo) == 0))) {
+      zz.hi &= 0x7fffffffU;
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    
+    /* fma (0, y, z) --> z
+       fma (x, 0, z) --> z
+    */
+    if ((((xx.hi << 1) | xx.lo) == 0) ||
+        (((yy.hi << 1) | yy.lo) == 0)) {
+      memcpy(&d, &zz, sizeof(double));
+      return d;
+    }
+    
+    /* Remaining special operands are subnormals (exponent field 0, which
+       became 0xffffffff above). Each one is normalized: the mantissa is
+       shifted left until its leading bit is at the top, the exponent is
+       lowered by the shift count, and the words are repacked with the
+       original sign. */
+    if (expo_x == 0xffffffffU) {
+      expo_x++;
+      t = xx.hi & 0x80000000U;
+      s = xx.lo >> 21;
+      xx.lo = xx.lo << 11;
+      xx.hi = xx.hi << 11;
+      xx.hi = xx.hi | s;
+      if (!xx.hi) {
+        xx.hi = xx.lo;
+        xx.lo = 0;
+        expo_x -= 32;
+      }
+      while (static_cast<int>(xx.hi) > 0) {
+        s = xx.lo >> 31;
+        xx.lo = xx.lo + xx.lo;
+        xx.hi = xx.hi + xx.hi;
+        xx.hi = xx.hi | s;
+        expo_x--;
+      }
+      xx.lo = (xx.lo >> 11);
+      xx.lo |= (xx.hi << 21);
+      xx.hi = (xx.hi >> 11) | t;
+    }
+    if (expo_y == 0xffffffffU) {
+      expo_y++;
+      t = yy.hi & 0x80000000U;
+      s = yy.lo >> 21;
+      yy.lo = yy.lo << 11;
+      yy.hi = yy.hi << 11;
+      yy.hi = yy.hi | s;
+      if (!yy.hi) {
+        yy.hi = yy.lo;
+        yy.lo = 0;
+        expo_y -= 32;
+      }
+      while (static_cast<int>(yy.hi) > 0) {
+        s = yy.lo >> 31;
+        yy.lo = yy.lo + yy.lo;
+        yy.hi = yy.hi + yy.hi;
+        yy.hi = yy.hi | s;
+        expo_y--;
+      }
+      yy.lo = (yy.lo >> 11);
+      yy.lo |= (yy.hi << 21);
+      yy.hi = (yy.hi >> 11) | t;
+    }
+    if (expo_z == 0xffffffffU) {
+      expo_z++;
+      t = zz.hi & 0x80000000U;
+      s = zz.lo >> 21;
+      zz.lo = zz.lo << 11;
+      zz.hi = zz.hi << 11;
+      zz.hi = zz.hi | s;
+      if (!zz.hi) {
+        zz.hi = zz.lo;
+        zz.lo = 0;
+        expo_z -= 32;
+      }
+      while (static_cast<int>(zz.hi) > 0) {
+        s = zz.lo >> 31;
+        zz.lo = zz.lo + zz.lo;
+        zz.hi = zz.hi + zz.hi;
+        zz.hi = zz.hi | s;
+        expo_z--;
+      }
+      zz.lo = (zz.lo >> 11);
+      zz.lo |= (zz.hi << 21);
+      zz.hi = (zz.hi >> 11) | t;
+    }
+  }
+  
+  expo_x = expo_x + expo_y; /* exponent of the product (both terms are expo - 1) */
+  expo_y = xx.hi ^ yy.hi; /* bit 31 holds the sign of the product */
+  t = xx.lo >> 21;
+  xx.lo = xx.lo << 11;
+  xx.hi = xx.hi << 11;
+  xx.hi = xx.hi | t;
+  yy.hi = yy.hi & 0x000fffffU;
+  xx.hi = xx.hi | 0x80000000U; /* set mantissa hidden bit */
+  yy.hi = yy.hi | 0x00100000U; /* set mantissa hidden bit */
+
+  /* Multiply the two 64-bit mantissa words as four 32x32 partial products,
+     propagating carries through t and s, into prod3:prod2:prod1:prod0. */
+  prod0 = xx.lo * yy.lo;
+  prod1 =(unsigned)((static_cast<unsigned long long>(xx.lo)*static_cast<unsigned long long>(yy.lo))>>32ULL);
+  prod2 = xx.hi * yy.lo;
+  prod3 = xx.lo * yy.hi;
+  prod1 += prod2;
+  t = (unsigned)(prod1 < prod2);
+  prod1 += prod3;
+  t += prod1 < prod3;
+  prod2 =(unsigned)((static_cast<unsigned long long>(xx.hi)*static_cast<unsigned long long>(yy.lo))>>32ULL);
+  prod3 =(unsigned)((static_cast<unsigned long long>(xx.lo)*static_cast<unsigned long long>(yy.hi))>>32ULL);
+  prod2 += prod3;
+  s = (unsigned)(prod2 < prod3);
+  prod3 = xx.hi * yy.hi;
+  prod2 += prod3;
+  s += prod2 < prod3;
+  prod2 += t;
+  s += prod2 < t;
+  prod3 =(unsigned)((static_cast<unsigned long long>(xx.hi)*static_cast<unsigned long long>(yy.hi))>>32ULL);
+  prod3 = prod3 + s;
+  
+  yy.lo = prod0;                 /* mantissa */
+  yy.hi = prod1;                 /* mantissa */
+  xx.lo = prod2;                 /* mantissa */
+  xx.hi = prod3;                 /* mantissa */
+  expo_x = expo_x - (1023 - 2);  /* expo-1 */
+  expo_y = expo_y & 0x80000000U;  /* sign */
+
+  if (xx.hi < 0x00100000U) { /* leading bit one position low: shift the product left by one */
+    s = xx.lo >> 31;
+    s = (xx.hi << 1) + s;
+    xx.hi = s;
+    s = yy.hi >> 31;
+    s = (xx.lo << 1) + s;
+    xx.lo = s;
+    s = yy.lo >> 31;
+    s = (yy.hi << 1) + s;
+    yy.hi = s;
+    s = yy.lo << 1;
+    yy.lo = s;
+    expo_x--;
+  }
+
+  t = 0;
+  if (((zz.hi << 1) | zz.lo) != 0) { /* z is not zero */
+    
+    s = zz.hi & 0x80000000U;
+    
+    zz.hi &= 0x000fffffU;
+    zz.hi |= 0x00100000U;
+    ww.hi = 0;
+    ww.lo = 0;
+    
+    /* compare and swap. put augend into xx:yy */
+    if (static_cast<int>(expo_z) > static_cast<int>(expo_x)) {
+      t = expo_z;
+      expo_z = expo_x;
+      expo_x = t;
+      t = zz.hi;
+      zz.hi = xx.hi;
+      xx.hi = t;
+      t = zz.lo;
+      zz.lo = xx.lo;
+      xx.lo = t;
+      t = ww.hi;
+      ww.hi = yy.hi;
+      yy.hi = t;
+      t = ww.lo;
+      ww.lo = yy.lo;
+      yy.lo = t;
+      t = expo_y;
+      expo_y = s;
+      s = t;
+    }
+    
+    /* augend_sign = expo_y, augend_mant = xx:yy, augend_expo = expo_x */
+    /* addend_sign = s, addend_mant = zz:ww, addend_expo = expo_z */
+    expo_z = expo_x - expo_z; /* from here on: right-shift distance for the addend */
+    u = expo_y ^ s; /* bit 31 set when the two signs differ */
+    if (expo_z <= 107) {
+      /* denormalize addend; bits shifted out are collected in t, with any
+         lost bits OR-ed into its lsb as a sticky flag */
+      t = 0;
+      while (expo_z >= 32) {
+        t     = ww.lo | (t != 0);
+        ww.lo = ww.hi;
+        ww.hi = zz.lo;
+        zz.lo = zz.hi;
+        zz.hi = 0;
+        expo_z -= 32;
+      }
+      if (expo_z) {
+        t     = (t     >> expo_z) | (ww.lo << (32 - expo_z)) | 
+                ((t << (32 - expo_z)) != 0);
+        ww.lo = (ww.lo >> expo_z) | (ww.hi << (32 - expo_z));
+        ww.hi = (ww.hi >> expo_z) | (zz.lo << (32 - expo_z));
+        zz.lo = (zz.lo >> expo_z) | (zz.hi << (32 - expo_z));
+        zz.hi = (zz.hi >> expo_z);
+      }
+    } else { /* addend shifted out entirely: only a sticky bit remains */
+      t = 1;
+      ww.lo = 0;
+      ww.hi = 0;
+      zz.lo = 0;
+      zz.hi = 0;
+    }
+    if (static_cast<int>(u) < 0) {
+      /* signs differ, effective subtraction (multi-word, with borrow in s) */
+      t = (unsigned)(-static_cast<int>(t));
+      s = (unsigned)(t != 0);
+      u = yy.lo - s;
+      s = (unsigned)(u > yy.lo);
+      yy.lo = u - ww.lo;
+      s += yy.lo > u;
+      u = yy.hi - s;
+      s = (unsigned)(u > yy.hi);
+      yy.hi = u - ww.hi;
+      s += yy.hi > u;
+      u = xx.lo - s;
+      s = (unsigned)(u > xx.lo);
+      xx.lo = u - zz.lo;
+      s += xx.lo > u;
+      xx.hi = (xx.hi - zz.hi) - s;
+      if (!(xx.hi | xx.lo | yy.hi | yy.lo | t)) {
+        /* complete cancelation, return 0 (all words are zero, i.e. +0) */
+        memcpy(&d, &xx, sizeof(double));
+        return d;
+      }
+      if (static_cast<int>(xx.hi) < 0) {
+        /* Oops, augend had smaller mantissa. Negate mantissa and flip
+           sign of result
+        */
+        t = ~t;
+        yy.lo = ~yy.lo;
+        yy.hi = ~yy.hi;
+        xx.lo = ~xx.lo;
+        xx.hi = ~xx.hi;
+        if (++t == 0) {
+          if (++yy.lo == 0) {
+            if (++yy.hi == 0) {
+              if (++xx.lo == 0) {
+              ++xx.hi;
+              }
+            }
+          }
+        }
+        expo_y ^= 0x80000000U;
+      }
+        
+      /* normalize mantissa, if necessary */
+      while (!(xx.hi & 0x00100000U)) {
+        xx.hi = (xx.hi << 1) | (xx.lo >> 31);
+        xx.lo = (xx.lo << 1) | (yy.hi >> 31);
+        yy.hi = (yy.hi << 1) | (yy.lo >> 31);
+        yy.lo = (yy.lo << 1);
+        expo_x--;
+      }
+    } else {
+      /* signs are the same, effective addition (multi-word, with carry) */
+      yy.lo = yy.lo + ww.lo;
+      s = (unsigned)(yy.lo < ww.lo);
+      yy.hi = yy.hi + s;
+      u = (unsigned)(yy.hi < s);
+      yy.hi = yy.hi + ww.hi;
+      u += yy.hi < ww.hi;
+      xx.lo = xx.lo + u;
+      s = (unsigned)(xx.lo < u);
+      xx.lo = xx.lo + zz.lo;
+      s += xx.lo < zz.lo;
+      xx.hi = xx.hi + zz.hi + s;
+      if (xx.hi & 0x00200000U) { /* carry out of the mantissa: shift right by one */
+        t = t | (yy.lo << 31);
+        yy.lo = (yy.lo >> 1) | (yy.hi << 31);
+        yy.hi = (yy.hi >> 1) | (xx.lo << 31);
+        xx.lo = (xx.lo >> 1) | (xx.hi << 31);
+        xx.hi = ((xx.hi & 0x80000000U) | (xx.hi >> 1)) & ~0x40000000U;
+        expo_x++;
+      }
+    }
+  }
+  /* Fold the low words into t: its top bit ends up as the first discarded
+     bit and the remaining bits act as sticky information for rounding. */
+  t = yy.lo | (t != 0);
+  t = yy.hi | (t != 0);
+        
+  xx.hi |= expo_y; /* or in sign bit */
+  if (expo_x <= 0x7FDU) {
+    /* normal */
+    xx.hi = xx.hi & ~0x00100000U; /* lop off integer bit */
+    s = xx.lo & 1; /* mantissa lsb */
+    u = xx.lo;
+    xx.lo += (t == 0x80000000U) ? s : (t >> 31); /* exact tie: round to even; otherwise add the first discarded bit */
+    xx.hi += (u > xx.lo); /* carry from the low word */
+    xx.hi += ((expo_x + 1) << 20);
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  } else if (static_cast<int>(expo_x) >= 2046) {
+    /* overflow: return infinity with the result sign */
+    xx.hi = (xx.hi & 0x80000000U) | 0x7ff00000U;
+    xx.lo = 0;
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  }
+  /* subnormal */
+  expo_x = (unsigned)(-static_cast<int>(expo_x)); /* now the right-shift distance */
+  if (expo_x > 54) { /* too small to round up: return a signed zero */
+    xx.hi = xx.hi & 0x80000000U;
+    xx.lo = 0;
+    memcpy(&d, &xx, sizeof(double));
+    return d;
+  }  
+  yy.hi = xx.hi &  0x80000000U;   /* save sign bit */
+  xx.hi = xx.hi & ~0xffe00000U;
+  if (expo_x >= 32) {
+    t = xx.lo | (t != 0);
+    xx.lo = xx.hi;
+    xx.hi = 0;
+    expo_x -= 32;
+  }
+  if (expo_x) {
+    t     = (t     >> expo_x) | (xx.lo << (32 - expo_x)) | (t != 0);
+    xx.lo = (xx.lo >> expo_x) | (xx.hi << (32 - expo_x));
+    xx.hi = (xx.hi >> expo_x);
+  }
+  expo_x = xx.lo & 1; /* reused as the mantissa lsb for the tie-to-even rule */
+  u = xx.lo;
+  xx.lo += (t == 0x80000000U) ? expo_x : (t >> 31);
+  xx.hi += (u > xx.lo);
+  xx.hi |= yy.hi;
+  memcpy(&d, &xx, sizeof(double));
+  return d;
+}
+
+/*
+ * Return the representable double adjacent to a in the direction of b.
+ * NaN operands yield a + b. When both operands are zeros (of either sign) b
+ * is returned. Stepping away from a zero a yields 4.9406564584124654e-324
+ * carrying the sign of b. If a equals b, a is returned unchanged.
+ */
+__func__(double nextafter(double a, double b))
+{
+  unsigned long long int from_bits;
+  unsigned long long int toward_bits;
+
+  memcpy(&from_bits, &a, sizeof(double));
+  memcpy(&toward_bits, &b, sizeof(double));
+  if (__isnan(a) || __isnan(b)) {
+    return a + b; /* NaN */
+  }
+  if (((from_bits | toward_bits) << 1ULL) == 0ULL) {
+    return b;
+  }
+  if (a == 0.0) {
+    return copysign (4.9406564584124654e-324, b); /* crossover */
+  }
+  if (a != b) {
+    /* a is non-zero and both operands are ordered here. Moving away from
+       zero increments the bit pattern, moving toward zero decrements it. */
+    const int ascending = (a < b) ? 1 : 0;
+    const int positive = (a > 0.0) ? 1 : 0;
+    if (ascending == positive) {
+      from_bits++;
+    } else {
+      from_bits--;
+    }
+  }
+  memcpy(&a, &from_bits, sizeof(double));
+  return a;
+}
+
+/*
+ * Error function.
+ * For |a| >= 1 the result is 1 - P(|a|) * exp(-a*a), where P is a polynomial
+ * evaluated by Horner's rule; it is forced to 1 once |a| >= 6.5 and then given
+ * the sign of a. Otherwise (including NaN inputs) the result is a * Q(a*a)
+ * for a second polynomial Q.
+ */
+__func__(double erf(double a))
+{
+  /* Coefficients of P, highest power first. */
+  static const double outer_coeff[] = {
+    -1.28836351230756500E-019,
+     1.30597472161093370E-017,
+    -6.33924401259620500E-016,
+     1.96231865908940140E-014,
+    -4.35272243559990750E-013,
+     7.37083927929352150E-012,
+    -9.91402142550461630E-011,
+     1.08817017167760820E-009,
+    -9.93918713097634620E-009,
+     7.66739923255145500E-008,
+    -5.05440278302806720E-007,
+     2.87474157099000620E-006,
+    -1.42246725399722510E-005,
+     6.16994555079419460E-005,
+    -2.36305221938908790E-004,
+     8.05032844055371070E-004,
+    -2.45833366629108140E-003,
+     6.78340988296706120E-003,
+    -1.70509103597554640E-002,
+     3.93322852515666300E-002,
+    -8.37271292613764040E-002,
+     1.64870423707623280E-001,
+    -2.99729521787681470E-001,
+     4.99394435612628580E-001,
+    -7.52014596480123030E-001,
+     9.99933138314926250E-001,
+    -1.12836725321102670E+000,
+     9.99998988715182450E-001
+  };
+  /* Coefficients of Q, highest power first. */
+  static const double inner_coeff[] = {
+    -7.77946848895991420E-010,
+     1.37109803980285950E-008,
+    -1.62063137584932240E-007,
+     1.64471315712790040E-006,
+    -1.49247123020098620E-005,
+     1.20552935769006260E-004,
+    -8.54832592931448980E-004,
+     5.22397760611847340E-003,
+    -2.68661706431114690E-002,
+     1.12837916709441850E-001,
+    -3.76126389031835210E-001,
+     1.12837916709551260E+000
+  };
+  const double magnitude = fabs(a);
+  double acc, scale;
+  unsigned int k;
+
+  if (magnitude >= 1.0) {
+    acc = outer_coeff[0];
+    for (k = 1; k < sizeof(outer_coeff) / sizeof(outer_coeff[0]); k++) {
+      acc = acc * magnitude + outer_coeff[k];
+    }
+    scale = exp (-magnitude * magnitude);
+    acc = 1.0 - acc * scale;
+    if (magnitude >= 6.5) {
+      acc = 1.0;
+    }
+    return copysign (acc, a);
+  }
+
+  scale = a * a;
+  acc = inner_coeff[0];
+  for (k = 1; k < sizeof(inner_coeff) / sizeof(inner_coeff[0]); k++) {
+    acc = acc * scale + inner_coeff[k];
+  }
+  return acc * a;
+}
+
+/*
+ * Complementary error function.
+ *   NaN          -> a + a (NaN), returned before any range test
+ *   a < 0.75     -> 1.0 - erf(a)
+ *   a > 27.3     -> 0.0
+ *   0.75..5      -> rational approximation in 1/a, scaled by exp(-a*a)
+ *   5..27.3      -> polynomial in 1/(a*a), scaled by exp(-a*a) / a
+ * exp(-a*a) is evaluated as exp(-h*h) * exp(-(a-h)*(a+h)), with h equal to a
+ * truncated to a multiple of 1/16.
+ */
+__func__(double erfc(double a))
+{
+  double p, q, h, l;
+
+  /* A NaN fails every comparison below and would otherwise reach the
+     (int)(a * 16.0) conversion, which is undefined for NaN. */
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a < 0.75) {
+    return 1.0 - erf(a);
+  } 
+  if (a > 27.3) {
+    return 0.0;
+  }
+  if (a < 5.0) {
+    double t;
+    t = 1.0 / a;
+    /* numerator polynomial in t */
+    p =         1.9759923722227928E-008;
+    p = p * t - 1.0000002670474897E+000;
+    p = p * t - 7.4935303236347828E-001;
+    p = p * t - 1.5648136328071860E-001;
+    p = p * t + 1.2871196242447239E-001;
+    p = p * t + 1.1126459974811195E-001;
+    p = p * t + 4.0678642255914332E-002;
+    p = p * t + 7.9915414156678296E-003;
+    p = p * t + 7.1458332107840234E-004;
+    /* denominator polynomial in t */
+    q =     t + 2.7493547525030619E+000;
+    q = q * t + 3.3984254815725423E+000;
+    q = q * t + 2.4635304979947761E+000;
+    q = q * t + 1.1405284734691286E+000;
+    q = q * t + 3.4130157606195649E-001;
+    q = q * t + 6.2250967676044953E-002;
+    q = q * t + 5.5661370941268700E-003;
+    q = q * t + 1.0575248365468671E-009;
+    p = p / q;
+    p = p * t;
+    h = ((int)(a * 16.0)) * 0.0625;   /* a truncated to a multiple of 1/16 */
+    l = (a - h) * (a + h);            /* a*a - h*h */
+    q = exp(-h * h) * exp(-l);
+    q = q * 0.5;
+    p = p * q + q;
+    p = p * t;
+  } else {
+    double ooa, ooasq;
+
+    ooa = 1.0 / a;
+    ooasq = ooa * ooa;
+    p =            -4.0025406686930527E+005;
+    p = p * ooasq + 1.4420582543942123E+005;
+    p = p * ooasq - 2.7664185780951841E+004;
+    p = p * ooasq + 4.1144611644767283E+003;
+    p = p * ooasq - 5.8706000519209351E+002;
+    p = p * ooasq + 9.1490086446323375E+001;
+    p = p * ooasq - 1.6659491387740221E+001;
+    p = p * ooasq + 3.7024804085481784E+000;
+    p = p * ooasq - 1.0578553994424316E+000;
+    p = p * ooasq + 4.2314218745087778E-001;
+    p = p * ooasq - 2.8209479177354962E-001;
+    p = p * ooasq + 5.6418958354775606E-001;
+    h = ((int)(a * 16.0)) * 0.0625;   /* a truncated to a multiple of 1/16 */
+    l = (a - h) * (a + h);            /* a*a - h*h */
+    q = exp(-h * h) * exp(-l);
+    p = p * ooa;
+    p = p * q;
+  }
+  return p;
+}
+
+__func__(double lgamma(double a)) /* Host-side lgamma(): log of |Gamma(a)|.
+ *
+ * NaN and +/-infinity inputs return a * a. Otherwise fa = |a| selects one of
+ * five approximations (fa >= 8, [3, 8), [1.5, 3), [0.7, 1.5), below 0.7). An
+ * argument with a >= 0.0 returns that value directly; a negative argument is
+ * finished with the reflection step at the bottom of the function, and a
+ * negative integer returns 1.0 / 0.0.
+ */
+{
+  double s;
+  double t;
+  double i;
+  double fa;
+  double sum;
+  long long int quot;
+  if (__isnan(a) || __isinf(a)) {
+    return a * a;
+  }
+  fa = fabs(a);
+  if (fa >= 3.0) {
+    if (fa >= 8.0) {
+      /* Stirling approximation; coefficients from Hart et al, "Computer
+       * Approximations", Wiley 1968. Approximation 5404.
+       */
+      s = 1.0 / fa;
+      t = s * s;
+      sum =          -0.1633436431e-2;
+      sum = sum * t + 0.83645878922e-3;
+      sum = sum * t - 0.5951896861197e-3;
+      sum = sum * t + 0.793650576493454e-3;
+      sum = sum * t - 0.277777777735865004e-2;
+      sum = sum * t + 0.833333333333331018375e-1;
+      sum = sum * s + 0.918938533204672;
+      s = 0.5 * log (fa); /* the log term is added in two equal halves below */
+      t = fa - 0.5;
+      s = s * t;
+      t = s - fa;
+      s = s + sum;
+      t = t + s; /* t = (fa - 0.5) * log(fa) - fa + sum */
+    } else { /* 3 <= fa < 8: ratio of two polynomials in i = fa - 3, plus i */
+      i = fa - 3.0;
+      s =        -4.02412642744125560E+003;
+      s = s * i - 2.97693796998962000E+005;
+      s = s * i - 6.38367087682528790E+006;
+      s = s * i - 5.57807214576539320E+007;
+      s = s * i - 2.24585140671479230E+008;
+      s = s * i - 4.70690608529125090E+008;
+      s = s * i - 7.62587065363263010E+008;
+      s = s * i - 9.71405112477113250E+008;
+      t =     i - 1.02277248359873170E+003;
+      t = t * i - 1.34815350617954480E+005;
+      t = t * i - 4.64321188814343610E+006;
+      t = t * i - 6.48011106025542540E+007;
+      t = t * i - 4.19763847787431360E+008;
+      t = t * i - 1.25629926018000720E+009;
+      t = t * i - 1.40144133846491690E+009;
+      t = s / t;
+      t = t + i;
+    }
+  } else if (fa >= 1.5) { /* 1.5 <= fa < 3: polynomial in i = fa - 2, times i */
+    i = fa - 2.0;
+    t =         9.84839283076310610E-009;
+    t = t * i - 6.69743850483466500E-008;
+    t = t * i + 2.16565148880011450E-007;
+    t = t * i - 4.86170275781575260E-007;
+    t = t * i + 9.77962097401114400E-007;
+    t = t * i - 2.03041287574791810E-006;
+    t = t * i + 4.36119725805364580E-006;
+    t = t * i - 9.43829310866446590E-006;
+    t = t * i + 2.05106878496644220E-005;
+    t = t * i - 4.49271383742108440E-005;
+    t = t * i + 9.94570466342226000E-005;
+    t = t * i - 2.23154589559238440E-004;
+    t = t * i + 5.09669559149637430E-004;
+    t = t * i - 1.19275392649162300E-003;
+    t = t * i + 2.89051032936815490E-003;
+    t = t * i - 7.38555102806811700E-003;
+    t = t * i + 2.05808084278121250E-002;
+    t = t * i - 6.73523010532073720E-002;
+    t = t * i + 3.22467033424113040E-001;
+    t = t * i + 4.22784335098467190E-001;
+    t = t * i;
+  } else if (fa >= 0.7) { /* 0.7 <= fa < 1.5: polynomial in i = 1 - fa, times i */
+    i = 1.0 - fa;
+    t =         1.17786911519331130E-002;  
+    t = t * i + 3.89046747413522300E-002;
+    t = t * i + 5.90045711362049900E-002;
+    t = t * i + 6.02143305254344420E-002;
+    t = t * i + 5.61652708964839180E-002;
+    t = t * i + 5.75052755193461370E-002;
+    t = t * i + 6.21061973447320710E-002;
+    t = t * i + 6.67614724532521880E-002;
+    t = t * i + 7.14856037245421020E-002;
+    t = t * i + 7.69311251313347100E-002;
+    t = t * i + 8.33503129714946310E-002;
+    t = t * i + 9.09538288991182800E-002;
+    t = t * i + 1.00099591546322310E-001;
+    t = t * i + 1.11334278141734510E-001;
+    t = t * i + 1.25509666613462880E-001;
+    t = t * i + 1.44049896457704160E-001;
+    t = t * i + 1.69557177031481600E-001;
+    t = t * i + 2.07385551032182120E-001;
+    t = t * i + 2.70580808427600350E-001;
+    t = t * i + 4.00685634386517050E-001;
+    t = t * i + 8.22467033424113540E-001;
+    t = t * i + 5.77215664901532870E-001;
+    t = t * i;
+  } else { /* fa < 0.7: -log of a polynomial in fa */
+    t =         -9.04051686831357990E-008;
+    t = t * fa + 7.06814224969349250E-007;
+    t = t * fa - 3.80702154637902830E-007;
+    t = t * fa - 2.12880892189316100E-005;
+    t = t * fa + 1.29108470307156190E-004;
+    t = t * fa - 2.15932815215386580E-004;
+    t = t * fa - 1.16484324388538480E-003;
+    t = t * fa + 7.21883433044470670E-003;
+    t = t * fa - 9.62194579514229560E-003;
+    t = t * fa - 4.21977386992884450E-002;
+    t = t * fa + 1.66538611813682460E-001;
+    t = t * fa - 4.20026350606819980E-002;
+    t = t * fa - 6.55878071519427450E-001;
+    t = t * fa + 5.77215664901523870E-001;
+    t = t * fa;
+    t = t * fa + fa;
+    t = -log (t);
+  }
+  if (a >= 0.0) return t; /* no reflection needed; t holds the result for fa */
+  if (fa < 1e-19) return -log(fa); /* tiny negative argument */
+  i = floor(fa); /* used only for the integer test on the next line */
+  if (fa == i) return 1.0 / (fa - i); /* a is an integer: return infinity */
+  i = rint (2.0 * fa); /* nearest multiple of 1/2, so the residual below is small */
+  quot = static_cast<long long int>(i);
+  i = fa - 0.5 * i;
+  i = i * CUDART_PI;
+  if (quot & 1) { /* odd multiple of 1/2: use cos on the residual, otherwise sin */
+    i = cos(i);
+  } else {
+    i = sin(i);
+  }
+  i = fabs(i); /* only the magnitude of the sin/cos value is used */
+  t = log(CUDART_PI / (i * fa)) - t; /* reflection: log(pi / (i * fa)) minus the value for fa */
+  return t;
+}
+
+/*
+ * Turns the tag string of nan()/nanf() into a double-precision NaN bit
+ * pattern. A leading "0x"/"0X" selects hexadecimal, any other leading '0'
+ * selects octal, and everything else is read as decimal. A null pointer or
+ * an empty string gives a zero payload. Any invalid character forces the
+ * payload to zero; otherwise an overflowing value forces it to all ones.
+ * The low 52 bits of the payload are merged with 0x7ff8000000000000.
+ */
+__func__(unsigned long long int __internal_host_nan_kernel(const char *s))
+{
+  unsigned long long payload = 0;
+  bool overflowed = false;
+  bool malformed = false;
+
+  if (s) {
+    if (*s == '0') {
+      ++s;
+      if ((*s == 'x') || (*s == 'X')) {
+        /* hexadecimal digits, upper or lower case */
+        ++s;
+        while (*s == '0') ++s;
+        for (; *s; ++s) {
+          const int ch = (((*s) >= 'A') && ((*s) <= 'F')) ? (*s + 'a' - 'A') : (*s);
+          if (payload > 0x0fffffffffffffffULL) {
+            overflowed = true; /* another hex digit no longer fits in 64 bits */
+          }
+          if ((ch >= 'a') && (ch <= 'f')) {
+            payload = payload * 16 + (ch - 'a' + 10);
+          } else if ((ch >= '0') && (ch <= '9')) {
+            payload = payload * 16 + (ch - '0');
+          } else {
+            malformed = true;
+          }
+        }
+      } else {
+        /* octal digits */
+        while (*s == '0') ++s;
+        for (; *s; ++s) {
+          const int ch = *s;
+          if (payload > 0x1fffffffffffffffULL) {
+            overflowed = true; /* another octal digit no longer fits in 64 bits */
+          }
+          if ((ch >= '0') && (ch <= '7')) {
+            payload = payload * 8 + (ch - '0');
+          } else {
+            malformed = true;
+          }
+        }
+      }
+    } else {
+      /* decimal digits */
+      for (; *s; ++s) {
+        const int ch = *s;
+        if ((payload > 1844674407370955161ULL) ||
+            ((payload == 1844674407370955161ULL) && (ch > '5'))) {
+          overflowed = true;
+        }
+        if ((ch >= '0') && (ch <= '9')) {
+          payload = payload * 10 + (ch - '0');
+        } else {
+          malformed = true;
+        }
+      }
+    }
+  }
+  if (overflowed) {
+    payload = ~0ULL;
+  }
+  if (malformed) {
+    payload = 0ULL; /* an invalid character wins over overflow */
+  }
+  return (payload & 0x000fffffffffffffULL) | 0x7ff8000000000000ULL;
+}
+
+/*
+ * Host-side nan(): obtains the 64-bit pattern for tagp from
+ * __internal_host_nan_kernel() and copies it byte-for-byte into a double.
+ */
+__func__(double nan(const char *tagp))
+{
+  const unsigned long long bits = __internal_host_nan_kernel(tagp);
+  double result;
+  memcpy(&result, &bits, sizeof(double));
+  return result;
+}
+
+/*
+ * Evaluates a fixed degree-15 polynomial in a by Horner's scheme (constant
+ * term 1.0). tgamma() divides by, or inverts, the value returned here.
+ */
+__func__(double __host_tgamma_kernel(double a))
+{
+  double acc = - 4.4268934071252475E-010;
+  acc = acc * a - 2.0266591846658954E-007;
+  acc = acc * a + 1.1381211721119527E-006;
+  acc = acc * a - 1.2507734816630748E-006;
+  acc = acc * a - 2.0136501740408771E-005;
+  acc = acc * a + 1.2805012607354486E-004;
+  acc = acc * a - 2.1524140811527418E-004;
+  acc = acc * a - 1.1651675459704604E-003;
+  acc = acc * a + 7.2189432248466381E-003;
+  acc = acc * a - 9.6219715326862632E-003;
+  acc = acc * a - 4.2197734554722394E-002;
+  acc = acc * a + 1.6653861138250356E-001;
+  acc = acc * a - 4.2002635034105444E-002;
+  acc = acc * a - 6.5587807152025712E-001;
+  acc = acc * a + 5.7721566490153287E-001;
+  acc = acc * a + 1.0000000000000000E+000;
+  return acc;
+}
+
+/*
+ * Evaluates a fixed degree-9 polynomial in 1/a by Horner's scheme (constant
+ * term 1.0). Used as the correction factor in the Stirling-based paths of
+ * tgamma().
+ */
+__func__(double __host_stirling_poly(double a))
+{
+  const double recip = 1.0 / a;
+  double series = + 8.3949872067208726e-004;
+  series = series * recip - 5.1717909082605919e-005;
+  series = series * recip - 5.9216643735369393e-004;
+  series = series * recip + 6.9728137583658571e-005;
+  series = series * recip + 7.8403922172006662e-004;
+  series = series * recip - 2.2947209362139917e-004;
+  series = series * recip - 2.6813271604938273e-003;
+  series = series * recip + 3.4722222222222220e-003;
+  series = series * recip + 8.3333333333333329e-002;
+  series = series * recip + 1.0000000000000000e+000;
+  return series;
+}
+
+/*
+ * Stirling-style evaluation used by tgamma() for non-negative arguments of
+ * magnitude 20 or more: pow(a, a - 0.5) * exp(-a) * CUDART_SQRT_2PI times
+ * the correction from __host_stirling_poly(). For 142 <= a < 172 the power
+ * is applied as two factors of pow(a, 0.5 * a - 0.25), one before and one
+ * after the other terms. Any argument that is not below 172 returns
+ * exp(1000.0).
+ */
+__func__(double __host_tgamma_stirling(double a))
+{
+  const double correction = __host_stirling_poly (a);
+
+  if (a < 142.0) {
+    double result = pow (a, a - 0.5) * exp (-a);
+    result = result * CUDART_SQRT_2PI;
+    return result * correction;
+  }
+  if (a < 172.0) {
+    const double half_power = pow (a, 0.5 * a - 0.25);
+    double result = half_power * exp (-a);
+    result = result * CUDART_SQRT_2PI;
+    result = result * correction;
+    return result * half_power;
+  }
+  return exp(1000.0); /* INF */
+}
+
+__func__(double tgamma(double a)) /* Host-side tgamma().
+ *
+ * NaN returns a + a. For |a| < 20 the argument is stepped one unit at a
+ * time towards the input range of __host_tgamma_kernel() while the factors
+ * passed over are collected in s. For |a| >= 20, non-negative arguments are
+ * handed to __host_tgamma_stirling(); negative arguments combine
+ * __host_stirling_poly() with a reduced sin(pi * a). Negative integers (and
+ * -infinity) evaluate 0.0 / (x - floor(x)), and arguments below -185 return
+ * a signed zero.
+ */
+{
+  double s, xx, x = a;
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (fabs(x) < 20.0) {
+    if (x >= 0.0) {
+      s = 1.0;
+      xx = x;
+      while (xx > 1.5) { /* step down by 1, multiplying the factors into s */
+        xx = xx - 1.0;
+        s = s * xx;
+      }
+      if (x >= 0.5) { /* kernel is called with xx - 1 here, and with x itself when x < 0.5 */
+        xx = xx - 1.0;
+      }
+      xx = __host_tgamma_kernel (xx);
+      if (x < 0.5) {
+        xx = xx * x;
+      }
+      s = s / xx;
+    } else {
+      xx = x;
+      s = xx;
+      if (x == floor(x)) { /* negative integer */
+        return 0.0 / (x - floor(x));
+      }
+      while (xx < -0.5) { /* step up by 1, multiplying the factors into s */
+        xx = xx + 1.0;
+        s = s * xx;
+      }
+      xx = __host_tgamma_kernel (xx);
+      s = s * xx;
+      s = 1.0 / s;
+    }
+    return s;
+  } else {
+    if (x >= 0.0) {
+      return __host_tgamma_stirling (x);
+    } else {
+      double t;
+      int quot;
+      if (x == floor(x)) { /* negative integer or -infinity */
+        return 0.0 / (x - floor(x));
+      }
+      if (x < -185.0) { /* result is a signed zero; sign follows the parity of floor(x) */
+        int negative;
+        x = floor(x);
+        negative = ((x - (2.0 * floor(0.5 * x))) == 1.0); /* true when floor(x) is odd */
+        return negative ? (-1.0 / 1e308 / 1e308) : CUDART_ZERO;
+      }
+      /* compute sin(pi*x) accurately */
+      xx = rint (2.0 * x);
+      quot = static_cast<int>(xx);
+      xx = -0.5 * xx + x;
+      xx = xx * CUDART_PI;
+      if (quot & 1) {
+        xx = cos (xx);
+      } else {
+        xx = sin (xx);
+      }
+      if (quot & 2) {
+        xx = -xx;
+      }
+      x = fabs (x);
+      s = exp (-x);
+      t = x - 0.5;
+      if (x > 140.0) t = 0.5 * t; /* above 140 the power is split into two factors of pow(x, t) */
+      t = pow (x, t);
+      if (x > 140.0) s = s * t; /* first factor goes into s; the second is the division by t below */
+      s = s * __host_stirling_poly (x);
+      s = s * x;
+      s = s * xx;
+      s = 1.0 / s;
+      s = s * CUDART_SQRT_PIO2;
+      s = s / t;
+      return s;
+    }
+  }
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR FLOAT AND LONG DOUBLE ROUTINES FOR WINDOWS PLATFORM  *
+* MAP FLOAT AND LONG DOUBLE ROUTINES TO DOUBLE ROUTINES                        *
+*                                                                              *
+*******************************************************************************/
+
+/* long double variant: converts the argument to double and forwards to __signbit(). */
+__func__(int __signbitl(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __signbit(narrowed);
+}
+
+/* float variant: converts the argument to double and forwards to __signbit(). */
+__func__(int __signbitf(const float a))
+{
+  const double widened = static_cast<double>(a);
+  return __signbit(widened);
+}
+
+/* long double variant: converts the argument to double and forwards to __finite(). */
+__func__(int __finitel(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __finite(narrowed);
+}
+
+/* float variant: converts the argument to double and forwards to __finite(). */
+__func__(int __finitef(const float a))
+{
+  const double widened = static_cast<double>(a);
+  return __finite(widened);
+}
+
+/* long double variant: converts the argument to double and forwards to __isinf(). */
+__func__(int __isinfl(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __isinf(narrowed);
+}
+
+/* float variant: converts the argument to double and forwards to __isinf(). */
+__func__(int __isinff(const float a))
+{
+  const double widened = static_cast<double>(a);
+  return __isinf(widened);
+}
+
+/* long double variant: converts the argument to double and forwards to __isnan(). */
+__func__(int __isnanl(const long double a))
+{
+  const double narrowed = static_cast<double>(a);
+  return __isnan(narrowed);
+}
+
+/* float variant: converts the argument to double and forwards to __isnan(). */
+__func__(int __isnanf(const float a))
+{
+  const double widened = static_cast<double>(a);
+  return __isnan(widened);
+}
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/* Float wrapper: widens both operands to double, calls fmax(), narrows the result. */
+__func__(float fmaxf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(fmax(wide_a, wide_b));
+}
+
+/* Float wrapper: widens both operands to double, calls fmin(), narrows the result. */
+__func__(float fminf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(fmin(wide_a, wide_b));
+}
+
+/* Float wrapper: widens the argument to double, calls round(), narrows the result. */
+__func__(float roundf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(round(wide));
+}
+
+/* Float wrapper: widens the argument to double and returns lround() of it. */
+__func__(long int lroundf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return lround(wide);
+}
+
+/* Float wrapper: widens the argument to double and returns llround() of it. */
+__func__(long long int llroundf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return llround(wide);
+}
+
+/* Float wrapper: widens the argument to double, calls trunc(), narrows the result. */
+__func__(float truncf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(trunc(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls rint(), narrows the result. */
+__func__(float rintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(rint(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls nearbyint(), narrows the result. */
+__func__(float nearbyintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(nearbyint(wide));
+}
+
+/* Float wrapper: widens the argument to double and returns lrint() of it. */
+__func__(long int lrintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return lrint(wide);
+}
+
+/* Float wrapper: widens the argument to double and returns llrint() of it. */
+__func__(long long int llrintf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return llrint(wide);
+}
+
+/* Float wrapper: widens the argument to double, calls logb(), narrows the result. */
+__func__(float logbf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(logb(wide));
+}
+
+/* Float wrapper: widens a to double, calls scalbln() with b unchanged, narrows the result. */
+__func__(float scalblnf(const float a, const long int b))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(scalbln(wide, b));
+}
+
+/* Float wrapper: widens the argument to double, calls log2(), narrows the result. */
+__func__(float log2f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(log2(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls exp2(), narrows the result. */
+__func__(float exp2f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(exp2(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls acosh(), narrows the result. */
+__func__(float acoshf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(acosh(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls asinh(), narrows the result. */
+__func__(float asinhf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(asinh(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls atanh(), narrows the result. */
+__func__(float atanhf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(atanh(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls cbrt(), narrows the result. */
+__func__(float cbrtf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(cbrt(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls expm1(), narrows the result. */
+__func__(float expm1f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(expm1(wide));
+}
+
+/* Float wrapper: widens both operands to double, calls fdim(), narrows the result. */
+__func__(float fdimf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(fdim(wide_a, wide_b));
+}
+
+/* Float wrapper: widens the argument to double, calls log1p(), narrows the result. */
+__func__(float log1pf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(log1p(wide));
+}
+
+/* Float wrapper: widens a to double, calls scalbn() with b unchanged, narrows the result. */
+__func__(float scalbnf(const float a, const int b))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(scalbn(wide, b));
+}
+
+/* Float wrapper: widens all three operands to double, calls fma(), narrows the result. */
+__func__(float fmaf(const float a, const float b, const float c))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  const double wide_c = static_cast<double>(c);
+  return static_cast<float>(fma(wide_a, wide_b, wide_c));
+}
+
+/* Float wrapper: widens the argument to double and returns ilogb() of it. */
+__func__(int ilogbf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return ilogb(wide);
+}
+
+/* Float wrapper: widens the argument to double, calls erf(), narrows the result. */
+__func__(float erff(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erf(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls erfc(), narrows the result. */
+__func__(float erfcf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfc(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls lgamma(), narrows the result. */
+__func__(float lgammaf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(lgamma(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls tgamma(), narrows the result. */
+__func__(float tgammaf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(tgamma(wide));
+}
+
+/*
+ * Float wrapper: widens both operands to double, calls remquo() with the
+ * caller's quo pointer passed through unchanged, narrows the result.
+ */
+__func__(float remquof(const float a, const float b, int *quo))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(remquo(wide_a, wide_b, quo));
+}
+
+/* Float wrapper: widens both operands to double, calls remainder(), narrows the result. */
+__func__(float remainderf(const float a, const float b))
+{
+  const double wide_a = static_cast<double>(a);
+  const double wide_b = static_cast<double>(b);
+  return static_cast<float>(remainder(wide_a, wide_b));
+}
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#if (defined _MSC_VER) && (_MSC_VER >= 1700)
+/* Float wrapper: widens the argument to double, calls _j0(), narrows the result. */
+__func__(float j0f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_j0(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls _j1(), narrows the result. */
+__func__(float j1f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_j1(wide));
+}
+
+/* Float wrapper: widens a to double, calls _jn() with n unchanged, narrows the result. */
+__func__(float jnf(const int n, const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_jn(n, wide));
+}
+
+/* Float wrapper: widens the argument to double, calls _y0(), narrows the result. */
+__func__(float y0f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_y0(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls _y1(), narrows the result. */
+__func__(float y1f(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_y1(wide));
+}
+
+/* Float wrapper: widens a to double, calls _yn() with n unchanged, narrows the result. */
+__func__(float ynf(const int n, const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(_yn(n, wide));
+}
+#endif /* (defined _MSC_VER) && (_MSC_VER >= 1700) */
+
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR FLOAT ROUTINES FOR WINDOWS PLATFORM                  *
+*                                                                              *
+*******************************************************************************/
+
+#if (!defined(_MSC_VER) || _MSC_VER < 1800)
+/*
+ * Host-side copysignf(): works on the raw 32-bit patterns, keeping every bit
+ * of a except bit 31 and taking bit 31 from b.
+ */
+__func__(float copysignf(float a, const float b))
+{
+  unsigned int magnitude_bits;
+  unsigned int sign_source;
+  unsigned int combined;
+
+  memcpy(&magnitude_bits, &a, sizeof(float));
+  memcpy(&sign_source, &b, sizeof(float));
+  combined = (magnitude_bits & ~0x80000000U) | (sign_source & 0x80000000U);
+  memcpy(&a, &combined, sizeof(float));
+  return a;
+}
+
+/*
+ * Host-side nextafterf(): steps the bit pattern of a by one towards b.
+ * A NaN operand returns a + b; two zeros return b; a zero a returns the
+ * smallest-magnitude float carrying the sign of b; a == b returns a.
+ */
+__func__(float nextafterf(float a, const float b))
+{
+  unsigned int bits_a;
+  unsigned int bits_b;
+
+  if (__isnanf(a) || __isnanf(b)) {
+    return a + b; /*NaN*/
+  }
+  memcpy(&bits_a, &a, sizeof(float));
+  memcpy(&bits_b, &b, sizeof(float));
+  if (((bits_a | bits_b) << 1U) == 0U) {
+    return b; /* both operands are zeros, sign bit ignored */
+  }
+  if (a == 0.0F) {
+    return copysignf(1.401298464e-045F, b); /*crossover*/
+  }
+  /* a is non-zero and not NaN here, so it is either positive or negative */
+  if (a < b) {
+    if (a > 0.0F) {
+      ++bits_a;
+    } else {
+      --bits_a;
+    }
+  } else if (a > b) {
+    if (a > 0.0F) {
+      --bits_a;
+    } else {
+      ++bits_a;
+    }
+  }
+  memcpy(&a, &bits_a, sizeof(float));
+  return a;
+}
+
+/*
+ * Host-side nanf(): takes the low 32 bits of the pattern produced by
+ * __internal_host_nan_kernel(), keeps its low 23 bits, and merges them with
+ * 0x7fc00000 before copying the result into a float.
+ */
+__func__(float nanf(const char *tagp))
+{
+  const unsigned int low_word = static_cast<unsigned int>(__internal_host_nan_kernel(tagp));
+  const unsigned int pattern = (low_word & 0x007fffffU) | 0x7fc00000U;
+  float result;
+  memcpy(&result, &pattern, sizeof(float));
+  return result;
+}
+
+#endif /* (!defined(_MSC_VER) || _MSC_VER < 1800) */
+
+#endif /* _WIN32 */
+
+/*******************************************************************************
+*                                                                              *
+* HOST IMPLEMENTATION FOR DOUBLE AND FLOAT ROUTINES. ALL PLATFORMS             *
+*                                                                              *
+*******************************************************************************/
+
+/* Host-side rsqrt(): reciprocal of sqrt(a). */
+__func__(double rsqrt(const double a))
+{
+  const double root = sqrt(a);
+  return 1.0 / root;
+}
+
+/*
+ * Host-side rcbrt(): NaN returns a + a; zero and infinity return 1.0 / a.
+ * Otherwise an initial estimate exp2(-log2(|a|) / 3) is refined once and the
+ * sign of a is applied to the result.
+ */
+__func__(double rcbrt(const double a))
+{
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a == 0.0 || __isinf(a)) {
+    return 1.0 / a;
+  }
+
+  const double mag = fabs(a);
+  double est = exp2(-CUDART_THIRD * log2(mag));                   /* initial approximation */
+  est = ((est*est) * (-mag*est) + 1.0) * (CUDART_THIRD*est) + est;/* refine approximation */
+#if defined(__APPLE__)
+  const bool negative = (__signbitd(a) != 0);
+#else /* __APPLE__ */
+  const bool negative = (__signbit(a) != 0);
+#endif /* __APPLE__ */
+  return negative ? -est : est;
+}
+
+/*
+ * Host-side sinpi(): sin(pi * a). NaN returns a + a, zero and infinity
+ * return sin(a), and any other integer returns a zero carrying the sign of
+ * a. Otherwise a is reduced by the nearest multiple of 1/2 and the low two
+ * bits of that multiple choose between sin/cos and the sign.
+ */
+__func__(double sinpi(double a))
+{
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a == 0.0 || __isinf(a)) {
+    return sin (a);
+  }
+  if (a == floor(a)) {
+    return ((a / 1.0e308) / 1.0e308) / 1.0e308;
+  }
+
+  const double half_turns = round(a + a);
+  const int quadrant = static_cast<int>(static_cast<long long int>(half_turns));
+  const double reduced = (a - half_turns * 0.5) * CUDART_PI;
+  const double value = (quadrant & 1) ? cos (reduced) : sin (reduced);
+  return (quadrant & 2) ? -value : value;
+}
+
+/*
+ * Host-side cospi(): cos(pi * a). NaN returns a + a and infinity returns
+ * cos(a). Magnitudes above 9.0071992547409920e+015 are treated as 0.0.
+ * Otherwise a is reduced by the nearest multiple of 1/2; that multiple plus
+ * one chooses between sin/cos and the sign. A zero result is returned as
+ * +0.0.
+ */
+__func__(double cospi(double a))
+{
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (__isinf(a)) {
+    return cos (a);
+  }
+  if (fabs(a) > 9.0071992547409920e+015) {
+    a = 0.0;
+  }
+
+  const double half_turns = round(a + a);
+  int quadrant = static_cast<int>(static_cast<long long int>(half_turns));
+  const double reduced = (a - half_turns * 0.5) * CUDART_PI;
+  quadrant++;
+  double value = (quadrant & 1) ? cos (reduced) : sin (reduced);
+  if (quadrant & 2) {
+    value = -value;
+  }
+  if (value == 0.0) {
+    value = fabs(value); /* drop the sign of a negative zero */
+  }
+  return value;
+}
+
+/* Stores sinpi(a) through sptr and then cospi(a) through cptr. */
+__func__(void sincospi(const double a, double *sptr, double *cptr))
+{
+  const double sine = sinpi(a);
+  *sptr = sine;
+  const double cosine = cospi(a);
+  *cptr = cosine;
+}
+
+__func__(double erfinv(const double a)) /* Host-side erfinv(): inverse error function.
+ *
+ * With fa = |a|: fa > 1 returns the NaN bit pattern 0xfff8000000000000,
+ * fa == 1 returns a * exp(1000.0), and the remaining range is split at
+ * 0.9375 and 0.75 into three rational approximations p / q. A NaN argument
+ * fails every comparison, reaches the last branch and propagates through
+ * the arithmetic there.
+ */
+{
+  double p, q, t, fa;
+  unsigned long long int l;
+
+  fa = fabs(a);
+  if (fa >= 1.0) {
+    l = 0xfff8000000000000ULL;
+    memcpy(&t, &l, sizeof(double)); /* INDEFINITE */
+    if (fa == 1.0) {
+      t = a * exp(1000.0);          /* Infinity */
+    }
+  } else if (fa >= 0.9375) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
+     */
+    t = log1p(-fa);
+    t = 1.0 / sqrt(-t); /* approximation variable: 1 / sqrt(-log(1 - fa)) */
+    p =         2.7834010353747001060e-3;
+    p = p * t + 8.6030097526280260580e-1;
+    p = p * t + 2.1371214997265515515e+0;
+    p = p * t + 3.1598519601132090206e+0;
+    p = p * t + 3.5780402569085996758e+0;
+    p = p * t + 1.5335297523989890804e+0;
+    p = p * t + 3.4839207139657522572e-1;
+    p = p * t + 5.3644861147153648366e-2;
+    p = p * t + 4.3836709877126095665e-3;
+    p = p * t + 1.3858518113496718808e-4;
+    p = p * t + 1.1738352509991666680e-6;
+    q =     t + 2.2859981272422905412e+0;
+    q = q * t + 4.3859045256449554654e+0;
+    q = q * t + 4.6632960348736635331e+0;
+    q = q * t + 3.9846608184671757296e+0;
+    q = q * t + 1.6068377709719017609e+0;
+    q = q * t + 3.5609087305900265560e-1;
+    q = q * t + 5.3963550303200816744e-2;
+    q = q * t + 4.3873424022706935023e-3;
+    q = q * t + 1.3858762165532246059e-4;
+    q = q * t + 1.1738313872397777529e-6;
+    t = p / (q * t);
+    if (a < 0.0) t = -t; /* this branch works on fa, so restore the sign of a */
+  } else if (fa >= 0.75) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 39
+    */
+    t = a * a - .87890625;
+    p =         .21489185007307062000e+0;
+    p = p * t - .64200071507209448655e+1;
+    p = p * t + .29631331505876308123e+2;
+    p = p * t - .47644367129787181803e+2;
+    p = p * t + .34810057749357500873e+2;
+    p = p * t - .12954198980646771502e+2;
+    p = p * t + .25349389220714893917e+1;
+    p = p * t - .24758242362823355486e+0;
+    p = p * t + .94897362808681080020e-2;
+    q =     t - .12831383833953226499e+2;
+    q = q * t + .41409991778428888716e+2;
+    q = q * t - .53715373448862143349e+2;
+    q = q * t + .33880176779595142685e+2;
+    q = q * t - .11315360624238054876e+2;
+    q = q * t + .20369295047216351160e+1;
+    q = q * t - .18611650627372178511e+0;
+    q = q * t + .67544512778850945940e-2;
+    p = p / q;
+    t = a * p; /* multiplying by a carries the sign */
+  } else {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 18
+    */
+    t = a * a - .5625;
+    p =       - .23886240104308755900e+2;
+    p = p * t + .45560204272689128170e+3;
+    p = p * t - .22977467176607144887e+4;
+    p = p * t + .46631433533434331287e+4;
+    p = p * t - .43799652308386926161e+4;
+    p = p * t + .19007153590528134753e+4;
+    p = p * t - .30786872642313695280e+3;
+    q =     t - .83288327901936570000e+2;
+    q = q * t + .92741319160935318800e+3;
+    q = q * t - .35088976383877264098e+4;
+    q = q * t + .59039348134843665626e+4;
+    q = q * t - .48481635430048872102e+4;
+    q = q * t + .18997769186453057810e+4;
+    q = q * t - .28386514725366621129e+3;
+    p = p / q;
+    t = a * p; /* multiplying by a carries the sign */
+  }
+  return t;
+}
+
+__func__(double erfcinv(const double a)) /* Host-side erfcinv(): inverse complementary error function.
+ *
+ * NaN returns a + a. A negative argument returns the NaN bit pattern
+ * 0xfff8000000000000 and a zero argument returns (1.0 - a) * exp(1000.0).
+ * Arguments of 0.0625 and above are delegated to erfinv(1.0 - a); smaller
+ * positive arguments use one of two rational approximations in
+ * 1 / sqrt(-log(a)), split at 1e-100.
+ */
+{
+  double t;
+  unsigned long long int l;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  if (a <= 0.0) {
+    l = 0xfff8000000000000ULL;
+    memcpy(&t, &l, sizeof(double));   /* INDEFINITE */
+    if (a == 0.0) {
+        t = (1.0 - a) * exp(1000.0);  /* Infinity */
+    }
+  } 
+  else if (a >= 0.0625) {
+    t = erfinv (1.0 - a);
+  }
+  else if (a >= 1e-100) {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 59
+    */
+    double p, q;
+    t = log(a);
+    t = 1.0 / sqrt(-t); /* approximation variable: 1 / sqrt(-log(a)) */
+    p =         2.7834010353747001060e-3;
+    p = p * t + 8.6030097526280260580e-1;
+    p = p * t + 2.1371214997265515515e+0;
+    p = p * t + 3.1598519601132090206e+0;
+    p = p * t + 3.5780402569085996758e+0;
+    p = p * t + 1.5335297523989890804e+0;
+    p = p * t + 3.4839207139657522572e-1;
+    p = p * t + 5.3644861147153648366e-2;
+    p = p * t + 4.3836709877126095665e-3;
+    p = p * t + 1.3858518113496718808e-4;
+    p = p * t + 1.1738352509991666680e-6;
+    q =     t + 2.2859981272422905412e+0;
+    q = q * t + 4.3859045256449554654e+0;
+    q = q * t + 4.6632960348736635331e+0;
+    q = q * t + 3.9846608184671757296e+0;
+    q = q * t + 1.6068377709719017609e+0;
+    q = q * t + 3.5609087305900265560e-1;
+    q = q * t + 5.3963550303200816744e-2;
+    q = q * t + 4.3873424022706935023e-3;
+    q = q * t + 1.3858762165532246059e-4;
+    q = q * t + 1.1738313872397777529e-6;
+    t = p / (q * t);
+  }
+  else {
+    /* Based on: J.M. Blair, C.A. Edwards, J.H. Johnson: Rational Chebyshev
+       Approximations for the Inverse of the Error Function. Mathematics of
+       Computation, Vol. 30, No. 136 (Oct. 1976), pp. 827-830. Table 82
+    */
+    double p, q;
+    t = log(a);
+    t = 1.0 / sqrt(-t); /* approximation variable: 1 / sqrt(-log(a)) */
+    p =         6.9952990607058154858e-1;
+    p = p * t + 1.9507620287580568829e+0;
+    p = p * t + 8.2810030904462690216e-1;
+    p = p * t + 1.1279046353630280005e-1;
+    p = p * t + 6.0537914739162189689e-3;
+    p = p * t + 1.3714329569665128933e-4;
+    p = p * t + 1.2964481560643197452e-6;
+    p = p * t + 4.6156006321345332510e-9;
+    p = p * t + 4.5344689563209398450e-12;
+    q =     t + 1.5771922386662040546e+0;
+    q = q * t + 2.1238242087454993542e+0;
+    q = q * t + 8.4001814918178042919e-1;
+    q = q * t + 1.1311889334355782065e-1;
+    q = q * t + 6.0574830550097140404e-3;
+    q = q * t + 1.3715891988350205065e-4;
+    q = q * t + 1.2964671850944981713e-6;
+    q = q * t + 4.6156017600933592558e-9;
+    q = q * t + 4.5344687377088206783e-12;
+    t = p / (q * t);
+  }
+  return t;
+}
+
+/* Host-side normcdfinv(): -1.4142135623730951 times erfcinv() of twice the argument. */
+__func__(double normcdfinv(const double a))
+{
+  const double doubled = a + a;
+  return -1.4142135623730951 * erfcinv(doubled);
+}
+
+__func__(double normcdf(double a)) /* Host-side normcdf(): 0.5 * erfc() of a scaled argument.
+ *
+ * The argument is clamped to magnitude 38.5, the product of a with
+ * -CUDART_SQRT_HALF_HI / -CUDART_SQRT_HALF_LO is formed as a leading part t1
+ * plus a correction t2, and erfc() is evaluated at their sum. For a < -1.0
+ * the part of the product lost in that sum is applied as a first-order
+ * correction to the erfc() value.
+ * NOTE(review): v1 and v2 look like a high/low split of -sqrt(0.5) matching
+ * the CUDART_SQRT_HALF_HI/LO constants defined elsewhere - confirm.
+ */
+{
+  double ah, al, t1, t2, u1, u2, v1, v2, z;
+  if (fabs (a) > 38.5) a = copysign (38.5, a);
+  ah = a * 134217729.0; /* 134217729 = 2^27 + 1; splits a into u1 (high) and u2 (low) */
+  u1 = (a - ah) + ah;
+  u2 = a - u1;
+  v1 = -7.0710678398609161e-01;
+  v2 =  2.7995440410322203e-09;
+  t1 = a * -CUDART_SQRT_HALF_HI;
+  t2 = (((u1 * v1 - t1) + u1 * v2) + u2 * v1) + u2 * v2; /* remainder of the split product after removing t1 */
+  t2 = (a * -CUDART_SQRT_HALF_LO) + t2;
+  ah = t1 + t2;
+  z = erfc (ah);
+  if (a < -1.0) {
+    al = (t1 - ah) + t2; /* part of t1 + t2 that did not fit into ah */
+    t1 = -2.0 * ah * z;
+    z = t1 * al + z;
+  }
+  return 0.5 * z;
+}
+
+/*
+ * Host-side erfcx(): scaled complementary error function
+ * exp(a*a) * erfc(a).
+ *
+ * NaN returns a + a. With x = |a|, the value for x is computed from a
+ * polynomial in (x - 4) / (x + 4) when x < 32 and from an asymptotic
+ * expansion in 1/x otherwise. A negative argument is then mapped with
+ * erfcx(a) = 2*exp(a*a) - erfcx(|a|); once |a| >= 27 that expression is
+ * +infinity in double precision and is returned directly.
+ */
+__func__(double erfcx(const double a))
+{
+  double x, t1, t2, t3;
+
+  if (__isnan(a)) {
+    return a + a;
+  }
+  x = fabs(a);
+  if (x < 32.0) {
+    /*
+     * This implementation of erfcx() is based on the algorithm in: M. M.
+     * Shepherd and J. G. Laframboise, "Chebyshev Approximation of (1 + 2x)
+     * exp(x^2)erfc x in 0 <= x < INF", Mathematics of Computation, Vol.
+     * 36, No. 153, January 1981, pp. 249-253. For the core approximation,
+     * the input domain [0,INF] is transformed via (x-k) / (x+k) where k is
+     * a precision-dependent constant. Here, we choose k = 4.0, so the input
+     * domain [0, 27.3] is transformed into the core approximation domain
+     * [-1, 0.744409].
+     */
+    /*
+    // Compute (1+2*x)*exp(x*x)*erfc(x)
+    */
+    /* t2 = (x-4.0)/(x+4.0), transforming [0,INF] to [-1,+1] */
+    t1 = x - 4.0;
+    t2 = x + 4.0;
+    t2 = t1 / t2;
+    /* approximate on [-1, 0.744409] */
+    t1 =         - 3.5602694826817400E-010;
+    t1 = t1 * t2 - 9.7239122591447274E-009;
+    t1 = t1 * t2 - 8.9350224851649119E-009;
+    t1 = t1 * t2 + 1.0404430921625484E-007;
+    t1 = t1 * t2 + 5.8806698585341259E-008;
+    t1 = t1 * t2 - 8.2147414929116908E-007;
+    t1 = t1 * t2 + 3.0956409853306241E-007;
+    t1 = t1 * t2 + 5.7087871844325649E-006;
+    t1 = t1 * t2 - 1.1231787437600085E-005;
+    t1 = t1 * t2 - 2.4399558857200190E-005;
+    t1 = t1 * t2 + 1.5062557169571788E-004;
+    t1 = t1 * t2 - 1.9925637684786154E-004;
+    t1 = t1 * t2 - 7.5777429182785833E-004;
+    t1 = t1 * t2 + 5.0319698792599572E-003;
+    t1 = t1 * t2 - 1.6197733895953217E-002;
+    t1 = t1 * t2 + 3.7167515553018733E-002;
+    t1 = t1 * t2 - 6.6330365827532434E-002;
+    t1 = t1 * t2 + 9.3732834997115544E-002;
+    t1 = t1 * t2 - 1.0103906603555676E-001;
+    t1 = t1 * t2 + 6.8097054254735140E-002;
+    t1 = t1 * t2 + 1.5379652102605428E-002;
+    t1 = t1 * t2 - 1.3962111684056291E-001;
+    t1 = t1 * t2 + 1.2329951186255526E+000;
+    /*
+    // Note: (1+2*x)*exp(x*x)*erfc(x) / (1+2*x) = exp(x*x)*erfc(x)
+    */
+    t2 = 2.0 * x + 1.0;
+    t1 = t1 / t2;
+  } else {
+    /* asymptotic expansion for large arguments */
+    t2 = 1.0 / x;
+    t3 = t2 * t2;
+    t1 =         -29.53125;
+    t1 = t1 * t3 + 6.5625;
+    t1 = t1 * t3 - 1.875;
+    t1 = t1 * t3 + 0.75;
+    t1 = t1 * t3 - 0.5;
+    t1 = t1 * t3 + 1.0;
+    t2 = t2 * 5.6418958354775628e-001;
+    t1 = t1 * t2;
+  }
+  if (a < 0.0) {
+    /*
+    // Note: erfcx(x) = 2*exp(x^2) - erfcx(|x|)
+    */
+    if (x >= 27.0) {
+      /*
+       * For x >= 27 the split below has t2 >= 27, so exp(t2 * t2) is
+       * exp(>= 729) and overflows to +INF, making the whole result +INF.
+       * Returning it here keeps x * 16.0 within int range: converting an
+       * out-of-range double (very large |a| or -INF) to int is undefined.
+       */
+      return exp(1000.0); /* INF */
+    }
+    /* split x into a multiple of 1/16 (t2) and a small remainder so that
+       exp(x*x) is formed as exp(t2*t2) * exp((x-t2)*(x+t2)) */
+    t2 = (static_cast<int>(x * 16.0)) * 0.0625;
+    t3 = (x - t2) * (x + t2);
+    t3 = exp(t2 * t2) * exp(t3);
+    t3 = t3 + t3;
+    t1 = t3 - t1;
+  }
+  return t1;
+}
+
+/* Float wrapper: widens the argument to double, calls rsqrt(), narrows the result. */
+__func__(float rsqrtf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(rsqrt(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls rcbrt(), narrows the result. */
+__func__(float rcbrtf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(rcbrt(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls sinpi(), narrows the result. */
+__func__(float sinpif(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(sinpi(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls cospi(), narrows the result. */
+__func__(float cospif(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(cospi(wide));
+}
+
+/*
+ * Float wrapper: calls sincospi() on the widened argument, then stores the
+ * narrowed sine through sptr and the narrowed cosine through cptr.
+ */
+__func__(void sincospif(const float a, float *sptr, float *cptr))
+{
+  double wide_sin;
+  double wide_cos;
+
+  sincospi(static_cast<double>(a), &wide_sin, &wide_cos);
+  *sptr = static_cast<float>(wide_sin);
+  *cptr = static_cast<float>(wide_cos);
+}
+
+/* Float wrapper: widens the argument to double, calls erfinv(), narrows the result. */
+__func__(float erfinvf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfinv(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls erfcinv(), narrows the result. */
+__func__(float erfcinvf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfcinv(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls normcdfinv(), narrows the result. */
+__func__(float normcdfinvf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(normcdfinv(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls normcdf(), narrows the result. */
+__func__(float normcdff(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(normcdf(wide));
+}
+
+/* Float wrapper: widens the argument to double, calls erfcx(), narrows the result. */
+__func__(float erfcxf(const float a))
+{
+  const double wide = static_cast<double>(a);
+  return static_cast<float>(erfcx(wide));
+}
+
+#if defined(_WIN32)
+#pragma warning (pop)
+#endif /* _WIN32 */
+
+#endif /* !__CUDACC__ */
+
+#endif /* !__MATH_FUNCTIONS_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_HPP__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/nvfunctional b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/nvfunctional
new file mode 100644
index 0000000000000000000000000000000000000000..5cb9ffeb9cb9f1d202cb1f5cb1d4d7e88a416475
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/nvfunctional
@@ -0,0 +1,621 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2014-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/nvfunctional is an internal header file and must not be used directly.  Please use nvfunctional instead.")
+#else
+#warning "crt/nvfunctional is an internal header file and must not be used directly.  Please use nvfunctional instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
+#endif
+
+#ifndef __NV_LIBCXX_FUNCTIONAL_H__
+#define __NV_LIBCXX_FUNCTIONAL_H__
+
+#if __cplusplus < 201103L 
+  #if defined(_MSC_VER)
+    #if _MSC_VER < 1800
+      #error This library requires VS 2013 and above
+    #endif /* _MSC_VER < 1800 */
+  #else /* !_MSC_VER */
+    #error This library requires support for the ISO C++ 2011 standard
+  #endif /* _MSC_VER */
+#endif /* __cplusplus */
+
+#if defined(_MSC_VER)
+  #define __NV_ALIGNOF __alignof
+  #define __NV_NOEXCEPT
+  #define __NV_CONSTEXPR
+#else /* !_MSC_VER */
+  #define __NV_ALIGNOF alignof
+  #define __NV_NOEXCEPT noexcept
+  #define __NV_CONSTEXPR constexpr
+#endif /* _MSC_VER */
+
+#include <type_traits>
+#include <cstddef>
+#include <new>
+
+// n3290 20.8
+namespace nvstd
+{
+
+namespace internal {
+
+// D.8.1 base (deprecated) [depr.base]
+template <class _Arg, class _Result>
+struct unary_function // typedef carrier only; has no data members or functions
+{
+  typedef _Arg argument_type;   // type of the single argument
+  typedef _Result result_type;  // type of the call result
+};
+
+template <class _Arg1, class _Arg2, class _Result>
+struct binary_function // typedef carrier only; has no data members or functions
+{
+  typedef _Arg1 first_argument_type;   // type of the first argument
+  typedef _Arg2 second_argument_type;  // type of the second argument
+  typedef _Result result_type;         // type of the call result
+};
+
+// move: casts __t to an rvalue reference of its referred-to type
+template <class _T>
+inline __device__ __host__
+typename std::remove_reference<_T>::type&& move(_T&& __t) __NV_NOEXCEPT
+{
+  return static_cast<typename std::remove_reference<_T>::type&&>(__t);
+}
+
+// 20.2.2 swap [utility.swap]
+// swap: exchanges two values through one move construction and two move assignments
+template<class _T, 
+         class = typename std::enable_if<
+                   std::is_move_constructible<_T>::value &&
+                   std::is_move_assignable<_T>::value>::type>
+inline __device__ __host__
+void swap(_T& __a, _T& __b) 
+#if !defined(_MSC_VER)
+noexcept(std::is_nothrow_move_constructible<_T>::value &&
+         std::is_nothrow_move_assignable<_T>::value)
+#endif /* !defined(_MSC_VER) */
+{
+  _T __saved(internal::move(__a));  // keep the old value of __a
+  __a = internal::move(__b);        // __a takes over the value of __b
+  __b = internal::move(__saved);    // __b receives the saved value
+}
+
+// 20.2.3 forward/move helpers [forward]
+// forward (lvalue overload): casts __t to _T&&
+template <class _T> 
+inline __device__ __host__
+_T&& forward(typename std::remove_reference<_T>::type& __t) __NV_NOEXCEPT
+{
+  return static_cast<_T&&>(__t);
+}
+
+template <class _T> 
+inline __device__ __host__
+_T&& forward(typename std::remove_reference<_T>::type&& __t) __NV_NOEXCEPT
+{ // rvalue overload: rejects at compile time an lvalue-reference _T, then casts __t to _T&&
+  static_assert(!std::is_lvalue_reference<_T>::value,
+                "Error: __t is instantiated with an lvalue reference type");
+  return static_cast<_T&&>(__t);
+}
+
+} // namespace internal
+
+namespace __functional_helpers
+{
+
+struct __dummy_class;
+
+// Store small functors locally:
+// a functor is eligible for local storage if it is one of the following types:
+// * member object pointer;
+// * member function pointer;
+// * closure type of size less than or equal to the largest size of 
+//   the above types;
+// * function pointer;
+// * any callable class whose size is less than or equal to
+//   the largest one of the above types;
+union _Small_functor_types // used only for its sizeof / alignof (see _Small_functor_data)
+{
+  void *__obj;                         // object pointer
+  void (*__func_ptr)();                // plain function pointer
+  void (__dummy_class::*mem_fn_ptr)(); // member function pointer
+};
+
+struct _Small_functor_data { // raw byte buffer sized like the _Small_functor_types union
+  char __data[sizeof(_Small_functor_types)];
+};
+
+template <class _RetType, class ..._ArgTypes>
+struct __maybe_base_function // primary template: contributes no typedefs
+{ };
+
+template <class _RetType, class _T1>
+struct __maybe_base_function<_RetType(_T1)> // one-argument signature
+  : public internal::unary_function<_T1, _RetType>
+{ };
+
+template <class _RetType, class _T1, class _T2>
+struct __maybe_base_function<_RetType(_T1, _T2)> // two-argument signature
+  : public internal::binary_function<_T1, _T2, _RetType>
+{ };
+
+} // namespace __functional_helpers
+
+// 20.8.11 Polymorphic function wrappers [func.wrap]
+
+// 20.8.11.1 Class bad_function_call [func.wrap.badcall]
+// unimplemented because of exception
+// class bad_function_call : public std::exception
+
+// 20.8.11.2 Class template function [func.wrap.func]
+
+template<class> class function; // undefined
+
+// Simplified version of template class function, which
+//   * does not support allocator_arg_t;
+//   * does not support target and target_type that rely on RTTI
+//   * does not throw bad_function_call exception on invoking a NULL target
+template <class _RetType, class ..._ArgTypes>
+class function<_RetType(_ArgTypes...)> 
+  : public __functional_helpers::__maybe_base_function<_RetType(_ArgTypes...)>
+{ // Type-erased wrapper around a callable with signature _RetType(_ArgTypes...).
+  __functional_helpers::_Small_functor_data __small_functor_data; // in-object storage for small targets
+  void *__obj; // stored target: &__small_functor_data, a block obtained from new, or 0 when empty
+  typedef _RetType(*__meta_fn_type)(void *, _ArgTypes...);
+  __meta_fn_type __meta_fn; // trampoline that invokes the target behind __obj
+  typedef void(*__cloner_type)(function &, const function &);
+  __cloner_type __cloner; // copy-constructs the target of one wrapper into another
+  typedef void(*__destructor_type)(function *);
+  __destructor_type __destructor; // destroys the target (deletes it when it was allocated with new)
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  __NV_CONSTEXPR bool __use_small_functor_data() const
+  { // true when _F fits the in-object buffer in both size and alignment
+    return (sizeof(_F) <= sizeof(__small_functor_data) &&
+            __NV_ALIGNOF(_F) <= __NV_ALIGNOF(
+                                  __functional_helpers::_Small_functor_types));
+  }
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  void* __get_small_functor_data() const
+  { // address of the first byte of the in-object buffer
+    return (void*)(&__small_functor_data.__data[0]);
+  }
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  bool __is_small_functor_data() const
+  { // true when the current target lives in this object's own buffer
+    return __obj == __get_small_functor_data();
+  }
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static _F& __get_functor(void *__p)
+  { // reinterprets the untyped storage pointer as an _F object
+    return *((_F*)__p);
+  }
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static bool __is_empty_functor(const _F& /*__p*/)
+  { // generic callables are never treated as empty
+    return false;
+  }
+
+  #pragma nv_exec_check_disable
+  template <class _F>
+  __device__ __host__
+  static bool __is_empty_functor(const _F* __p)
+  { // a null pointer is empty
+    return !__p;
+  }
+  
+  #pragma nv_exec_check_disable
+  template <class _Res, class _C>
+  __device__ __host__
+  static bool __is_empty_functor(const _Res _C::* __p)
+  { // a null member pointer is empty
+    return !__p;
+  }
+ 
+  #pragma nv_exec_check_disable
+  template <class _Res, class... _Args>
+  __device__ __host__
+  static bool __is_empty_functor(const function<_Res(_Args...)>& __p)
+  { // another wrapper is empty when it holds no target
+    return !__p;
+  }
+  
+  template <class _F>
+  struct __make_cloner
+  {
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static void __clone_data(function &__dest, const function &__src)
+    { // copy-constructs the _F held by __src into __dest and sets __dest.__obj
+      if (__dest.__use_small_functor_data<_F>()) {
+        __dest.__obj = __dest.__get_small_functor_data();
+        new (__dest.__obj) _F(__src.__get_functor<_F>(__src.__obj));
+      }
+      else {
+        __dest.__obj = new _F(__src.__get_functor<_F>(__src.__obj));
+      }
+    }
+  };
+
+  template <class _F>
+  struct __make_destructor
+  {
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static void __destruct(function *__fn)
+    { // mirrors the storage choice made at construction time
+      if (__fn->__use_small_functor_data<_F>()) {
+        (__fn->__get_functor<_F>(__fn->__obj)).~_F(); // in-object: run the destructor only
+      }
+      else {
+        delete (_F*)(__fn->__obj); // allocated with new: destroy and free
+      }
+    }
+  };
+
+  // We cannot simply define __make_functor in the following way:
+  // template <class _T, _F>
+  // __make_functor;
+  // template <class _RetType1, class _F, class... _ArgTypes1>
+  // struct __make_functor<_RetType1(_ArgTypes1...), _F> 
+  //
+  // because VS 2013 cannot unpack _RetType1(_ArgTypes1...)
+  template <class _RetType1, class _F, class... _ArgTypes1>
+  struct __make_functor
+  {
+    typedef _RetType1 type;
+
+    #pragma nv_exec_check_disable
+    __device__ __host__
+    static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
+    { // calls the stored _F with the forwarded arguments
+      return __get_functor<_F>(__d)(
+               internal::forward<_ArgTypes1>(__args)...);
+    }
+  };
+
+  template <class _RetType1, class _C, class _M, class... _ArgTypes1>
+  struct __make_functor<_RetType1, _M _C::*,_ArgTypes1...>
+  {
+    typedef _RetType1 type;
+    typedef _RetType1(*_Fn)(_ArgTypes1...); // NOTE(review): the stored object is a member pointer (_M _C::*) but is read back below as this plain function-pointer type - confirm this is intended
+
+    #pragma nv_exec_check_disable    
+    __device__ __host__
+    static _RetType1 __invoke(void *__d, _ArgTypes1... __args)
+    {
+      return __get_functor<_Fn>(__d)(
+               internal::forward<_ArgTypes1>(__args)...);
+    }
+  };
+
+// workaround for GCC version below 4.8
+#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
+  template <class _F>
+  struct __check_callability
+    : public std::integral_constant<bool, 
+                                    !std::is_same<_F, std::nullptr_t>::value>
+  { };
+#elif defined(_MSC_VER)
+  // simulate VC 2013's behavior...
+  template <class _F>
+  struct __check_callability1
+    : public 
+        std::integral_constant<bool, 
+          // std::result_of does not handle member pointers well 
+          std::is_member_pointer<_F>::value ||
+          std::is_convertible<
+            _RetType,
+            typename std::result_of<_F(_ArgTypes...)>::type
+          >::value
+        >
+  { };
+
+  template <class _F>
+  struct __check_callability
+    : public std::integral_constant<
+               bool,
+               !std::is_same<_F, function>::value && 
+               __check_callability1<typename std::remove_cv<_F>::type>::value>
+  { };
+#else /* !((__GNUC__ == 4) && (__GNUC_MINOR__ < 8)) _MSC_VER */
+  template <class _F,
+            class _T = typename std::result_of<_F(_ArgTypes...)>::type>
+  struct __check_callability
+    : public std::integral_constant<
+               bool,
+               !std::is_same<_F, function>::value && 
+                 std::is_convertible< _T, _RetType>::value>
+  { };
+#endif /* __GNUC__ == 4) && (__GNUC_MINOR__ < 8) */
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  void __destroy()
+  { // releases the target, if any; only __obj is reset afterwards
+    if (__obj) {
+      __destructor(this);
+      __obj = 0;
+    }
+  }
+  
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  void __clear()
+  { // puts every bookkeeping member into the empty state without destroying anything
+    __obj = 0;
+    __meta_fn = 0;
+    __cloner = 0;
+    __destructor = 0;
+  }
+
+public:
+  typedef _RetType result_type;
+
+/* 
+ * These typedef(s) are derived from __maybe_base_function
+ * typedef T1 argument_type;        // only if sizeof...(ArgTypes) == 1 and
+ *                                  // the type in ArgTypes is T1
+ * typedef T1 first_argument_type;  // only if sizeof...(ArgTypes) == 2 and
+ *                                  // ArgTypes contains T1 and T2
+ * typedef T2 second_argument_type; // only if sizeof...(ArgTypes) == 2 and
+ *                                  // ArgTypes contains T1 and T2
+ */
+
+  // 20.8.11.2.1 construct/copy/destroy [func.wrap.con]
+  
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function() __NV_NOEXCEPT
+    : __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {} // empty wrapper
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(std::nullptr_t) __NV_NOEXCEPT
+    : __obj(0), __meta_fn(0), __cloner(0), __destructor(0) {} // empty wrapper
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(const function &__fn)
+  { // copy: an empty source yields an empty wrapper, otherwise the target is cloned
+    if (__fn.__obj == 0) {
+      __clear();
+    }
+    else {
+      __meta_fn = __fn.__meta_fn;
+      __destructor = __fn.__destructor;
+      __fn.__cloner(*this, __fn); // sets this->__obj
+      __cloner = __fn.__cloner;
+    }
+  }
+
+  // Move: begin empty so that swap() never reads indeterminate members.
+  #pragma nv_exec_check_disable 
+  __device__ __host__ 
+  function(function &&__fn)
+    : __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
+  { __fn.swap(*this); } // the source is left empty
+
+  // VS 2013 cannot process __check_callability type trait.
+  // So, we check callability using static_assert instead of
+  // using SFINAE such as
+  // template<class _F, 
+  //          class = typename std::enable_if<
+  //                    __check_callability<_F>::value
+  //         >::type>
+  
+  #pragma nv_exec_check_disable   
+  template<class _F>
+  __device__ __host__ 
+  function(_F);
+
+  // copy and swap
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  function& operator=(const function& __fn)
+  {
+    function(__fn).swap(*this); // the temporary takes the old target and destroys it
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  function& operator=(function&& __fn)
+  {
+    function(internal::move(__fn)).swap(*this); // the temporary takes the old target and destroys it
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  function& operator=(std::nullptr_t)
+  { // drops the current target, leaving the wrapper empty
+    __destroy();
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable
+  template<class _F>
+  __device__ __host__
+  function&
+  operator=(_F&& __fn) 
+  { // builds a temporary wrapper around __fn, then swaps it in
+    static_assert(__check_callability<_F>::value,
+                  "Unable to create functor object!");
+    function(internal::forward<_F>(__fn)).swap(*this);
+    return *this;
+  }
+
+  #pragma nv_exec_check_disable
+  __device__ __host__
+  ~function()
+  {
+    __destroy();
+  }
+
+  // 20.8.11.2.2 function modifiers [func.wrap.func.mod]
+  #pragma nv_exec_check_disable 
+  __device__ __host__
+  void swap(function& __fn) __NV_NOEXCEPT
+  { // exchanges targets; __obj is re-pointed whenever a target sits in an in-object buffer
+    internal::swap(__meta_fn, __fn.__meta_fn);
+    internal::swap(__cloner, __fn.__cloner);
+    internal::swap(__destructor, __fn.__destructor);
+
+    if (__is_small_functor_data() && __fn.__is_small_functor_data()) {
+      internal::swap(__small_functor_data, __fn.__small_functor_data); // both in-object: swap the bytes only
+    }
+    else if (__is_small_functor_data()) {
+      internal::swap(__small_functor_data, __fn.__small_functor_data);
+      internal::swap(__obj, __fn.__obj);
+      __fn.__obj = __fn.__get_small_functor_data(); // our bytes now live in __fn's buffer
+    }
+    else if (__fn.__is_small_functor_data()) {
+      internal::swap(__small_functor_data, __fn.__small_functor_data);
+      internal::swap(__obj, __fn.__obj);
+      __obj = __get_small_functor_data(); // __fn's bytes now live in our buffer
+    }
+    else {
+      internal::swap(__obj, __fn.__obj); // neither in-object: swap the pointers
+    }
+  }
+
+  // 20.8.11.2.3 function capacity [func.wrap.func.cap]
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  explicit operator bool() const __NV_NOEXCEPT
+  { // true when a target is stored
+    return __obj;
+  }
+
+  // 20.8.11.2.4 function invocation [func.wrap.func.inv]
+  // function::operator() can only be called in device code
+  // to avoid cross-execution space calls
+  #pragma nv_exec_check_disable   
+  __device__ __host__
+  _RetType operator()(_ArgTypes...) const;
+
+};
+
+// Out-of-line definitions
+#pragma nv_exec_check_disable
+template<class _RetType, class... _ArgTypes>
+template<class _F>
+__device__ __host__
+function<_RetType(_ArgTypes...)>::function(_F __fn) // wraps the callable __fn (taken by value)
+  : __obj(0), __meta_fn(0), __cloner(0), __destructor(0)
+{
+  static_assert(__check_callability<_F>::value,
+                "Unable to construct functor object!");
+  if (__is_empty_functor(__fn)) // null pointers, null member pointers and empty wrappers stay empty
+    return;
+  __meta_fn = &__make_functor<_RetType, _F, _ArgTypes...>::__invoke;
+  __cloner = &__make_cloner<_F>::__clone_data;
+  __destructor = &__make_destructor<_F>::__destruct;
+
+  if (__use_small_functor_data<_F>()) {
+    __obj = __get_small_functor_data();
+    new ((void*)__obj) _F(internal::move(__fn)); // construct inside the in-object buffer
+  }
+  else {
+    __obj = new _F(internal::move(__fn)); // does not fit the buffer: allocate with new
+  }
+}
+
+#pragma nv_exec_check_disable 
+template <class _RetType, class..._ArgTypes>
+__device__ __host__
+_RetType
+function<_RetType(_ArgTypes...)>::operator()(_ArgTypes... __args) const
+{ // calls through __meta_fn with no emptiness check; the caller must ensure a target is stored
+  return __meta_fn(__obj, internal::forward<_ArgTypes>(__args)...);
+}
+
+// 20.8.11.2.6, Null pointer comparisons:
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator==(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t) 
+__NV_NOEXCEPT
+{ // true when __fn holds no target
+  return !__fn;
+}
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator==(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
+__NV_NOEXCEPT
+{ // true when __fn holds no target (nullptr on the left-hand side)
+  return !__fn;
+}
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator!=(const function<_R(_ArgTypes...)>& __fn, std::nullptr_t)
+__NV_NOEXCEPT
+{ // true when __fn holds a target
+  return static_cast<bool>(__fn);
+}
+
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+bool operator!=(std::nullptr_t, const function<_R(_ArgTypes...)>& __fn)
+__NV_NOEXCEPT
+{ // true when __fn holds a target (nullptr on the left-hand side)
+  return static_cast<bool>(__fn);
+}
+
+// 20.8.11.2.7, specialized algorithms:
+#pragma nv_exec_check_disable 
+template <class _R, class... _ArgTypes>
+__device__ __host__
+void swap(function<_R(_ArgTypes...)>& __fn1, function<_R(_ArgTypes...)>& __fn2)
+{ // non-member form: delegates to function::swap
+  __fn1.swap(__fn2);
+}
+
+} // namespace nvstd
+
+#undef __NV_NOEXCEPT
+#undef __NV_CONSTEXPR
+#undef __NV_ALIGNOF
+
+#endif // __NV_LIBCXX_FUNCTIONAL_H__
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d798a6e5392ed631ed3b546304b16c94d65a1c8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_100_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_100_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_H__
+#endif
+
+#if !defined(__SM_100_RT_H__)
+#define __SM_100_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_100_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_100_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 1000
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-to-nearest-even mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_rn().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_rn(float2 x, float2 y, float2 z) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-towards-zero mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_rz().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_rz(float2 x, float2 y, float2 z) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-down mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_rd().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_rd(float2 x, float2 y, float2 z) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector fused multiply-add operation
+ * \cuda_math_formula x \times y + z \end_cuda_math_formula
+ * in round-up mode.
+ *
+ * Numeric behavior per component is the same as ::__fmaf_ru().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __ffma2_ru(float2 x, float2 y, float2 z) __DEF_IF_HOST
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-to-nearest-even mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_rn().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_rn(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-towards-zero mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_rz().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_rz(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-down mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_rd().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_rd(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector add operation
+ * \cuda_math_formula x + y \end_cuda_math_formula
+ * in round-up mode.
+ *
+ * Numeric behavior per component is the same as ::__fadd_ru().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fadd2_ru(float2 x, float2 y) __DEF_IF_HOST
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-to-nearest-even mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_rn().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_rn(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-towards-zero mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_rz().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_rz(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-down mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_rd().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_rd(float2 x, float2 y) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute vector multiply operation
+ * \cuda_math_formula x \times y \end_cuda_math_formula
+ * in round-up mode.
+ *
+ * Numeric behavior per component is the same as ::__fmul_ru().
+ *
+ * \note_requires_sm100
+ * \note_float2_perf
+ */
+__SM_100_RT_DECL__ float2 __fmul2_ru(float2 x, float2 y) __DEF_IF_HOST
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 1000 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_100_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_100_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_100_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5d620bf0b8091e0ea6cd48da00e8689b92cdd88
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_100_rt.hpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_100_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_100_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_HPP__
+#endif
+
+#if !defined(__SM_100_RT_HPP__)
+#define __SM_100_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_100_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_100_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 1000
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-10.0 builtin functions which are included   *
+*  as source (instead of being built in to the compiler)                       *
+*                                                                              *
+*******************************************************************************/
+
+extern "C" { // C-linkage declarations of the builtins that the wrappers below forward to
+  __device__ __device_builtin__ float2 __ffma2_rn_impl(float2 x, float2 y, float2 z);
+  __device__ __device_builtin__ float2 __ffma2_rz_impl(float2 x, float2 y, float2 z);
+  __device__ __device_builtin__ float2 __ffma2_rd_impl(float2 x, float2 y, float2 z);
+  __device__ __device_builtin__ float2 __ffma2_ru_impl(float2 x, float2 y, float2 z);
+
+  __device__ __device_builtin__ float2 __fadd2_rn_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fadd2_rz_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fadd2_rd_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fadd2_ru_impl(float2 x, float2 y);
+
+  __device__ __device_builtin__ float2 __fmul2_rn_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fmul2_rz_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fmul2_rd_impl(float2 x, float2 y);
+  __device__ __device_builtin__ float2 __fmul2_ru_impl(float2 x, float2 y);
+} // extern "C"
+
+__SM_100_RT_DECL__ float2 __ffma2_rn(float2 x, float2 y, float2 z) {
+  return __ffma2_rn_impl(x, y, z); // forwards to the builtin: vector x * y + z, round-to-nearest-even
+}
+__SM_100_RT_DECL__ float2 __ffma2_rz(float2 x, float2 y, float2 z) {
+  return __ffma2_rz_impl(x, y, z); // forwards to the builtin: vector x * y + z, round-towards-zero
+}
+__SM_100_RT_DECL__ float2 __ffma2_rd(float2 x, float2 y, float2 z) {
+  return __ffma2_rd_impl(x, y, z); // forwards to the builtin: vector x * y + z, round-down
+}
+__SM_100_RT_DECL__ float2 __ffma2_ru(float2 x, float2 y, float2 z) {
+  return __ffma2_ru_impl(x, y, z); // forwards to the builtin: vector x * y + z, round-up
+}
+
+__SM_100_RT_DECL__ float2 __fadd2_rn(float2 x, float2 y) {
+  return __fadd2_rn_impl(x, y); // forwards to the builtin: vector x + y, round-to-nearest-even
+}
+__SM_100_RT_DECL__ float2 __fadd2_rz(float2 x, float2 y) {
+  return __fadd2_rz_impl(x, y); // forwards to the builtin: vector x + y, round-towards-zero
+}
+__SM_100_RT_DECL__ float2 __fadd2_rd(float2 x, float2 y) {
+  return __fadd2_rd_impl(x, y); // forwards to the builtin: vector x + y, round-down
+}
+__SM_100_RT_DECL__ float2 __fadd2_ru(float2 x, float2 y) {
+  return __fadd2_ru_impl(x, y); // forwards to the builtin: vector x + y, round-up
+}
+
+__SM_100_RT_DECL__ float2 __fmul2_rn(float2 x, float2 y) {
+  return __fmul2_rn_impl(x, y); // forwards to the builtin: vector x * y, round-to-nearest-even
+}
+__SM_100_RT_DECL__ float2 __fmul2_rz(float2 x, float2 y) {
+  return __fmul2_rz_impl(x, y); // forwards to the builtin: vector x * y, round-towards-zero
+}
+__SM_100_RT_DECL__ float2 __fmul2_rd(float2 x, float2 y) {
+  return __fmul2_rd_impl(x, y); // forwards to the builtin: vector x * y, round-down
+}
+__SM_100_RT_DECL__ float2 __fmul2_ru(float2 x, float2 y) {
+  return __fmul2_ru_impl(x, y); // forwards to the builtin: vector x * y, round-up
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 1000 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_100_RT_DECL__
+
+#endif /* !__SM_100_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_100_RT_HPP__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..6046953afa8c5f71cf7058436de10397d6353e9e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright 2017-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_70_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_70_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
+#endif
+
+#if !defined(__SM_70_RT_H__)
+#define __SM_70_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_70_RT_DECL__ __host__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_70_RT_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_70_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ / _NVHPC_CUDA */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+
+/******************************************************************************
+ *     match: __match_any_sync / __match_all_sync, one overload per type     *
+ ******************************************************************************/
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) __DEF_IF_HOST
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) __DEF_IF_HOST
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) __DEF_IF_HOST
+
+__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) __DEF_IF_HOST  // __DEF_IF_HOST supplies either an empty body or a terminating ';'
+
+__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) __DEF_IF_HOST  // unsigned short overload; body comes from __DEF_IF_HOST
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_70_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_70_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_70_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_H__
+#endif
+
+
+#undef EXCLUDE_FROM_RTC
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..322496587325a1387e4280a509455e3ccc7caa1b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_70_rt.hpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2017-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_70_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_70_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
+#endif
+
+#if !defined(__SM_70_RT_HPP__)
+#define __SM_70_RT_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_70_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_70_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-7.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// __match_any_sync: one overload per value type. Each forwards to the 32-bit
+// (__match32_any_sync) or 64-bit (__match64_any_sync) builtin selected by the
+// width of the value; float/double are passed through their bit-cast helpers.
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned value) {
+  return __match32_any_sync(mask, value);  // unsigned is handed over as-is
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, int value) {
+  return __match32_any_sync(mask, value);  // any conversion happens implicitly at the call
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long value) {
+  if (sizeof(long) != sizeof(long long))  // long is narrower than long long
+    return __match32_any_sync(mask, (unsigned)value);
+  return __match64_any_sync(mask, (unsigned long long)value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long value) {
+  if (sizeof(long) != sizeof(long long))  // long is narrower than long long
+    return __match32_any_sync(mask, (unsigned)value);
+  return __match64_any_sync(mask, (unsigned long long)value);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, unsigned long long value) {
+  return __match64_any_sync(mask, value);  // unsigned long long is handed over as-is
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, long long value) {
+  return __match64_any_sync(mask, value);  // any conversion happens implicitly at the call
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, float value) {
+  return __match32_any_sync(mask, __float_as_uint(value));  // float goes through __float_as_uint first
+}
+
+__SM_70_RT_DECL__ unsigned int __match_any_sync(unsigned mask, double value) {
+  return __match64_any_sync(mask, __double_as_longlong(value));  // double goes through __double_as_longlong first
+}
+
+// __match_all_sync: one overload per value type. Each forwards to the 32-bit
+// (__match32_all_sync) or 64-bit (__match64_all_sync) builtin selected by the
+// width of the value, handing the caller's pred pointer on unchanged.
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned value, int *pred) {
+  return __match32_all_sync(mask, value, pred);  // unsigned is handed over as-is
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, int value, int *pred) {
+  return __match32_all_sync(mask, value, pred);  // any conversion happens implicitly at the call
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long value, int *pred) {
+  if (sizeof(long) != sizeof(long long))  // long is narrower than long long
+    return __match32_all_sync(mask, (unsigned)value, pred);
+  return __match64_all_sync(mask, (unsigned long long)value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long value, int *pred) {
+  if (sizeof(long) != sizeof(long long))  // long is narrower than long long
+    return __match32_all_sync(mask, (unsigned)value, pred);
+  return __match64_all_sync(mask, (unsigned long long)value, pred);
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, unsigned long long value, int *pred) {
+  return __match64_all_sync(mask, value, pred);  // unsigned long long is handed over as-is
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, long long value, int *pred) {
+  return __match64_all_sync(mask, value, pred);  // any conversion happens implicitly at the call
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, float value, int *pred) {
+  return __match32_all_sync(mask, __float_as_uint(value), pred);  // float goes through __float_as_uint first
+}
+
+__SM_70_RT_DECL__ unsigned int __match_all_sync(unsigned mask, double value, int *pred) {
+  return __match64_all_sync(mask, __double_as_longlong(value), pred);  // double goes through __double_as_longlong first
+}
+
+__SM_70_RT_DECL__ void __nanosleep(unsigned int ns) {
+    asm volatile("nanosleep.u32 %0;" :: "r"(ns));  // emits the PTX nanosleep.u32 instruction with ns as its only (register) input; no outputs
+}
+
+
+extern "C" __device__ __device_builtin__  // declaration only: this header provides no body for __usAtomicCAS
+unsigned short __usAtomicCAS(unsigned short *, unsigned short, unsigned short);
+
+__SM_70_RT_DECL__ unsigned short int atomicCAS(unsigned short int *address, unsigned short int compare, unsigned short int val) {
+  return __usAtomicCAS(address, compare, val);  // unsigned short overload: forwards all three arguments unchanged
+}
+
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 700 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_70_RT_DECL__
+
+#endif /* !__SM_70_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_70_RT_HPP__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e250634fe76651c2a15b5b492378efec1d3e0c5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2022-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_90_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_90_rt.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
+#endif
+
+#if !defined(__SM_90_RT_H__)
+#define __SM_90_RT_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_90_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_90_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+//NOTE: For NVRTC, the declarations between this #define and the matching '#undef EXCLUDE_FROM_RTC' have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+__SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, unsigned target_block_rank)  __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) __DEF_IF_HOST
+__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, unsigned cluster_cta_mask) __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterDim() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() __DEF_IF_HOST
+__SM_90_RT_DECL__ dim3 __clusterIdx() __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() __DEF_IF_HOST
+__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_arrive() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __cluster_barrier_wait() __DEF_IF_HOST
+__SM_90_RT_DECL__ void __threadfence_cluster() __DEF_IF_HOST
+
+__SM_90_RT_DECL__ float2 atomicAdd(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *__address, float2 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd(float4 *__address, float4 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *__address, float4 val) __DEF_IF_HOST
+__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *__address, float4 val) __DEF_IF_HOST
+
+#undef EXCLUDE_FROM_RTC
+
+//Note: the atomic functions below are templates, so they cannot be expressed in
+//NVRTC's builtins representation and have to be parsed on every NVRTC compilation
+//(note that the 'EXCLUDE_FROM_RTC' region ends above).
+
+
+#ifndef __NV_DISABLE_128_ATOMICS
+// lgen definitions for 128b atomics (argument roles noted below follow how the templated wrappers in this header call them)
+extern "C" {
+  __device__ __device_builtin__ void __u128AtomicCAS(void *, void *, void *, void *);         // (address, &compare, &val, &result)
+  __device__ __device_builtin__ void __u128AtomicCAS_block(void *, void *, void *, void *);   // (address, &compare, &val, &result)
+  __device__ __device_builtin__ void __u128AtomicCAS_system(void *, void *, void *, void *);  // (address, &compare, &val, &result)
+  __device__ __device_builtin__ void __u128AtomicExch(void *, void *, void *);                // (address, &val, &result)
+  __device__ __device_builtin__ void __u128AtomicExch_block(void *, void *, void *);          // (address, &val, &result)
+  __device__ __device_builtin__ void __u128AtomicExch_system(void *, void *, void *);         // (address, &val, &result)
+}
+
+// macro yielding an object's address as void *; going through a char reference works around types that overload the "&" operator
+#define __NV_ATOMIC_ADDRESSOF(__val) \
+        (void *)(&(const_cast<char &>(reinterpret_cast<const volatile char &>(__val))))
+
+// enable_if: the primary template has no members; only the <true, _T> specialization exposes the nested type __type
+template<bool __b, typename _T>
+struct __nv_atomic_enable_if { };
+
+template<typename _T>
+struct __nv_atomic_enable_if<true, _T> { typedef _T __type; };
+
+// alignof: spelled __alignof__ when compiling under NVRTC and __alignof otherwise
+#if defined(__CUDACC_RTC__)
+#define __NV_ATOMIC_ALIGNOF __alignof__
+#else
+#define __NV_ATOMIC_ALIGNOF __alignof
+#endif
+
+// trivially copyable: __val is the compile-time answer, computed with whichever compiler builtin the toolchain offers
+template <typename _T>
+struct __nv_atomic_triv_cp_helper {
+#if defined(__GNUC__)
+#if  (__GNUC__ < 4) || (__GNUC__ == 4 && __GNUC_MINOR__ < 3)
+  static const bool __val = true;  // GCC older than 4.3: no check is made, every type is accepted
+#elif (__GNUC__ < 5)
+  static const bool __val = __has_trivial_copy(_T);  // GCC 4.3 up to (not including) 5
+#else
+  static const bool __val = __is_trivially_copyable(_T);  // GCC 5 and newer
+#endif
+#else
+  static const bool __val = __is_trivially_copyable(_T);  // compilers that do not define __GNUC__
+#endif
+};
+#define __NV_ATOMIC_TRIVIALLY_COPYABLE(_T) \
+        __nv_atomic_triv_cp_helper<_T>::__val
+
+// return type: plain _T under C++20; otherwise _T only exists as a return type when sizeof(_T) == 16, alignment >= 16 and _T is trivially copyable
+#if __cplusplus >= 202002L // C++20 or greater
+#define __NV_ATOMIC_RET_TYPE(_T) _T
+#else
+#define __NV_ATOMIC_RET_TYPE(_T) typename \
+  __nv_atomic_enable_if<sizeof(_T) == 16 && \
+  __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
+  __NV_ATOMIC_TRIVIALLY_COPYABLE(_T), _T>::__type
+#endif
+
+// requires: under C++20 a requires-clause demanding sizeof(_T) == 16, alignment >= 16 and trivial copyability; expands to nothing before C++20
+#if __cplusplus >= 202002L // C++20 or greater
+#define __NV_ATOMIC_REQUIRES(_T) \
+  requires(sizeof(_T) == 16 && \
+  __NV_ATOMIC_ALIGNOF(_T) >= 16 && \
+  __NV_ATOMIC_TRIVIALLY_COPYABLE(_T))
+#else
+#define __NV_ATOMIC_REQUIRES(_T)
+#endif
+
+// temp value and return value: with C++11 (or MSVC) the result lives in a local union with an empty constructor (presumably so _T is not default-constructed - confirm); otherwise it is a plain local __ret
+#if __cplusplus >= 201103L || defined(_MSC_VER) // C++11 or greater, or MSC
+#define __NV_ATOMIC_TEMP(_T) union _U \
+  {_T __ret; __device__ __inline__ _U() {}}; _U __u
+#define __NV_ATOMIC_RET(_T) __u.__ret
+#else
+#define __NV_ATOMIC_TEMP(_T) _T __ret
+#define __NV_ATOMIC_RET(_T) __ret
+#endif
+
+// templated 128-bit atomics: each wrapper passes the target address, the addresses of its by-value arguments and of a temporary to the matching __u128Atomic* builtin, then returns that temporary
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);  // declares the temporary that is returned below
+  __u128AtomicCAS((void *)(__address),
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));  // last argument is the temporary's address
+  return __NV_ATOMIC_RET(_T);
+}
+
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS_block(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);  // declares the temporary that is returned below
+  __u128AtomicCAS_block((void *)(__address),  // _block flavour of the 128-bit CAS builtin
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));  // last argument is the temporary's address
+  return __NV_ATOMIC_RET(_T);
+}
+
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicCAS_system(_T *__address, _T __compare, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);  // declares the temporary that is returned below
+  __u128AtomicCAS_system((void *)(__address),  // _system flavour of the 128-bit CAS builtin
+                  __NV_ATOMIC_ADDRESSOF(__compare),
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));  // last argument is the temporary's address
+  return __NV_ATOMIC_RET(_T);
+}
+
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);  // declares the temporary that is returned below
+  __u128AtomicExch((void *)(__address),  // 128-bit exchange builtin
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));  // last argument is the temporary's address
+  return __NV_ATOMIC_RET(_T);
+}
+
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch_block(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);  // declares the temporary that is returned below
+  __u128AtomicExch_block((void *)(__address),  // _block flavour of the 128-bit exchange builtin
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));  // last argument is the temporary's address
+  return __NV_ATOMIC_RET(_T);
+}
+
+template <typename _T>
+__SM_90_RT_DECL__ __NV_ATOMIC_RET_TYPE(_T)
+atomicExch_system(_T *__address, _T __val) __NV_ATOMIC_REQUIRES(_T) {
+  __NV_ATOMIC_TEMP(_T);  // declares the temporary that is returned below
+  __u128AtomicExch_system((void *)(__address),  // _system flavour of the 128-bit exchange builtin
+                  __NV_ATOMIC_ADDRESSOF(__val),
+                  __NV_ATOMIC_ADDRESSOF(__NV_ATOMIC_RET(_T)));  // last argument is the temporary's address
+  return __NV_ATOMIC_RET(_T);
+}
+#endif /* !__NV_DISABLE_128_ATOMICS */
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_90_RT_DECL__
+
+#if (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA)
+#include "sm_90_rt.hpp"
+#endif /* (!defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)) || defined(_NVHPC_CUDA) */
+
+#endif /* !__SM_90_RT_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_H__
+#endif
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e61ac78b996fa03cadf60208bbd58f2e781f3ec
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/sm_90_rt.hpp
@@ -0,0 +1,248 @@
+/*
+ * Copyright 2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/sm_90_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/sm_90_rt.hpp is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
+#endif
+
+#if !defined(__SM_90_RT_HPP__)
+#define __SM_90_RT_HPP__
+
+#if defined(__CUDACC_RTC__)  /* qualifier set put in front of every function defined in this header; #undef'd near the end */
+#define __SM_90_RT_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_90_RT_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "device_types.h"
+#include "host_defines.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-9.0 builtin functions which are included as *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+extern "C" {  /* declarations only: the entry points that the inline wrappers further down forward to */
+  __device__ unsigned  __nv_isClusterShared_impl(const void *);
+  __device__ void * __nv_cluster_map_shared_rank_impl(const void *, unsigned);
+  __device__ unsigned __nv_cluster_query_shared_rank_impl(const void *);
+  __device__ unsigned __nv_clusterDimIsSpecifed_impl();  /* sic ("Specifed"): must match the call in __clusterDimIsSpecified() */
+  __device__ void __nv_clusterDim_impl(unsigned *, unsigned *, unsigned *);  /* the wrappers pass &x, &y, &z */
+  __device__ void __nv_clusterRelativeBlockIdx_impl(unsigned *, 
+                                                    unsigned *, unsigned *);
+  __device__ void __nv_clusterGridDimInClusters_impl(unsigned *, 
+                                                     unsigned *, unsigned *);
+  __device__ void __nv_clusterIdx_impl(unsigned *, unsigned *, unsigned *);
+  __device__ unsigned __nv_clusterRelativeBlockRank_impl();
+  __device__ unsigned __nv_clusterSizeInBlocks_impl();
+  __device__ void __nv_cluster_barrier_arrive_impl();
+  __device__ void __nv_cluster_barrier_arrive_relaxed_impl();
+  __device__ void __nv_cluster_barrier_wait_impl();
+  __device__ void __nv_threadfence_cluster_impl();
+  /* float2 / float4 atomic-add builtins, wrapped by the atomicAdd* overloads at the end of this header */
+  __device__ __device_builtin__ float2 __f2AtomicAdd(float2 *, float2);
+  __device__ __device_builtin__ float2 __f2AtomicAdd_block(float2 *, float2);
+  __device__ __device_builtin__ float2 __f2AtomicAdd_system(float2 *, float2);
+  __device__ __device_builtin__ float4 __f4AtomicAdd(float4 *, float4);
+  __device__ __device_builtin__ float4 __f4AtomicAdd_block(float4 *, float4);
+  __device__ __device_builtin__ float4 __f4AtomicAdd_system(float4 *, float4);
+} // extern "C"
+
+/* Thin alias: returns whatever __isShared(ptr) reports for the given pointer. */
+__SM_90_RT_DECL__ unsigned __isCtaShared(const void *ptr) {
+  return __isShared(ptr);
+}
+
+/* Forwards ptr to __nv_isClusterShared_impl and returns its unsigned result. */
+__SM_90_RT_DECL__ unsigned __isClusterShared(const void *ptr) {
+  return __nv_isClusterShared_impl(ptr);
+}
+
+/* Forwards (ptr, target_block_rank) to __nv_cluster_map_shared_rank_impl and
+ * returns the pointer that call produces. */
+__SM_90_RT_DECL__ void *__cluster_map_shared_rank(const void *ptr, unsigned target_block_rank) {
+  return __nv_cluster_map_shared_rank_impl(ptr, target_block_rank);
+}
+
+/* Forwards ptr to __nv_cluster_query_shared_rank_impl and returns its unsigned result. */
+__SM_90_RT_DECL__ unsigned __cluster_query_shared_rank(const void *ptr) {
+  return __nv_cluster_query_shared_rank_impl(ptr);
+}
+
+/* Builds a uint2 from two values: the result of __cvta_generic_to_shared(ptr)
+ * cast to unsigned, and cluster_cta_mask exactly as passed in. */
+__SM_90_RT_DECL__ uint2 __cluster_map_shared_multicast(const void *ptr, unsigned int cluster_cta_mask) {
+  return make_uint2((unsigned)__cvta_generic_to_shared(ptr), cluster_cta_mask);
+}
+
+/* Returns the value of __nv_clusterDimIsSpecifed_impl() ("Specifed" is the declared symbol's spelling). */
+__SM_90_RT_DECL__ unsigned __clusterDimIsSpecified() {
+  return __nv_clusterDimIsSpecifed_impl();
+}
+
+/* Collects the three values written by __nv_clusterDim_impl into a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterDim() {
+  unsigned dim_x, dim_y, dim_z;  /* out-parameters filled by the impl call */
+  __nv_clusterDim_impl(&dim_x, &dim_y, &dim_z);
+  return dim3(dim_x, dim_y, dim_z);
+}
+
+/* Collects the three values written by __nv_clusterRelativeBlockIdx_impl into a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterRelativeBlockIdx() {
+  unsigned idx_x, idx_y, idx_z;  /* out-parameters filled by the impl call */
+  __nv_clusterRelativeBlockIdx_impl(&idx_x, &idx_y, &idx_z);
+  return dim3(idx_x, idx_y, idx_z);
+}
+
+/* Collects the three values written by __nv_clusterGridDimInClusters_impl into a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterGridDimInClusters() {
+  unsigned grid_x, grid_y, grid_z;  /* out-parameters filled by the impl call */
+  __nv_clusterGridDimInClusters_impl(&grid_x, &grid_y, &grid_z);
+  return dim3(grid_x, grid_y, grid_z);
+}
+
+/* Collects the three values written by __nv_clusterIdx_impl into a dim3. */
+__SM_90_RT_DECL__ dim3 __clusterIdx() {
+  unsigned idx_x, idx_y, idx_z;  /* out-parameters filled by the impl call */
+  __nv_clusterIdx_impl(&idx_x, &idx_y, &idx_z);
+  return dim3(idx_x, idx_y, idx_z);
+}
+
+/* Returns the unsigned value produced by __nv_clusterRelativeBlockRank_impl(). */
+__SM_90_RT_DECL__ unsigned __clusterRelativeBlockRank() {
+  return __nv_clusterRelativeBlockRank_impl();
+}
+
+/* Returns the unsigned value produced by __nv_clusterSizeInBlocks_impl(). */
+__SM_90_RT_DECL__ unsigned __clusterSizeInBlocks() {
+  return __nv_clusterSizeInBlocks_impl();
+}
+
+/* Calls __nv_cluster_barrier_arrive_impl(); takes no arguments and returns nothing. */
+__SM_90_RT_DECL__ void __cluster_barrier_arrive() {
+  __nv_cluster_barrier_arrive_impl();
+}
+
+/* Calls __nv_cluster_barrier_arrive_relaxed_impl(); takes no arguments and returns nothing. */
+__SM_90_RT_DECL__ void __cluster_barrier_arrive_relaxed() {
+  __nv_cluster_barrier_arrive_relaxed_impl();
+}
+
+/* Calls __nv_cluster_barrier_wait_impl(); takes no arguments and returns nothing. */
+__SM_90_RT_DECL__ void __cluster_barrier_wait() {
+  __nv_cluster_barrier_wait_impl();
+}
+
+/* Calls __nv_threadfence_cluster_impl(); takes no arguments and returns nothing. */
+__SM_90_RT_DECL__ void __threadfence_cluster() {
+  __nv_threadfence_cluster_impl();
+}
+
+
+/* __PTR is "l" for Win64, LP64 and __CUDACC_RTC__ builds, "r" otherwise. NOTE(review): the atomicAdd wrappers below never reference it and this header contains no #undef __PTR -- confirm whether leaving it defined is intended. */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__SM_90_RT_DECL__ float2 atomicAdd(float2 *address, float2 val) {
+  return __f2AtomicAdd(address, val);  /* forwards both arguments unchanged to the builtin and returns its float2 result */
+}
+
+__SM_90_RT_DECL__ float2 atomicAdd_block(float2 *address, float2 val) {
+  return __f2AtomicAdd_block(address, val);  /* forwards both arguments unchanged to the builtin and returns its float2 result */
+}
+
+__SM_90_RT_DECL__ float2 atomicAdd_system(float2 *address, float2 val) {
+  return __f2AtomicAdd_system(address, val);  /* forwards both arguments unchanged to the builtin and returns its float2 result */
+}
+
+__SM_90_RT_DECL__ float4 atomicAdd(float4 *address, float4 val) {
+  return __f4AtomicAdd(address, val);  /* forwards both arguments unchanged to the builtin and returns its float4 result */
+}
+
+__SM_90_RT_DECL__ float4 atomicAdd_block(float4 *address, float4 val) {
+  return __f4AtomicAdd_block(address, val);  /* forwards both arguments unchanged to the builtin and returns its float4 result */
+}
+
+__SM_90_RT_DECL__ float4 atomicAdd_system(float4 *address, float4 val) {
+  return __f4AtomicAdd_system(address, val);  /* forwards both arguments unchanged to the builtin and returns its float4 result */
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 900 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_90_RT_DECL__
+
+#endif /* !__SM_90_RT_HPP__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_SM_90_RT_HPP__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/storage_class.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/storage_class.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fb19bd46ebde4a53dfad866050fad9fb0cbd222
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/crt/storage_class.h
@@ -0,0 +1,142 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2008-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("crt/storage_class.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "crt/storage_class.h is an internal header file and must not be used directly.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
+#endif
+
+#if !defined(__STORAGE_CLASS_H__)
+#define __STORAGE_CLASS_H__
+
+#if !defined(__var_used__)
+/* Default definition is empty; the __storage_* macros below append it after `static`. */
+#define __var_used__
+
+#endif /* __var_used__ */
+
+#if !defined(__loc_sc__)
+/* Pastes "__storage_", sc, size and loc into one __storage_* macro name, then emits loc after it; e.g. sc=static, size empty, loc=__device__ selects __storage_static__device__. */
+#define __loc_sc__(loc, size, sc) \
+        __storage##_##sc##size##loc loc
+
+#endif /* !__loc_sc__ */
+
+#if !defined(__storage___device__) /* __device__ family: the plain, extern and static forms all become `static __var_used__` */
+#define __storage___device__ static __var_used__
+#endif /* __storage___device__ */
+
+#if !defined(__storage_extern__device__)
+#define __storage_extern__device__ static __var_used__
+#endif /* __storage_extern__device__ */
+
+#if !defined(__storage_auto__device__) /* replacement text below is not valid C, so any expansion fails to compile */
+#define __storage_auto__device__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__device__ */
+
+#if !defined(__storage_static__device__)
+#define __storage_static__device__ static __var_used__
+#endif /* __storage_static__device__ */
+
+#if !defined(__storage___constant__) /* __constant__ family: the plain, extern and static forms all become `static __var_used__` */
+#define __storage___constant__ static __var_used__
+#endif /* __storage___constant__ */
+
+#if !defined(__storage_extern__constant__)
+#define __storage_extern__constant__ static __var_used__
+#endif /* __storage_extern__constant__ */
+
+#if !defined(__storage_auto__constant__) /* replacement text below is not valid C, so any expansion fails to compile */
+#define __storage_auto__constant__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__constant__ */
+
+#if !defined(__storage_static__constant__)
+#define __storage_static__constant__ static __var_used__
+#endif /* __storage_static__constant__ */
+
+#if !defined(__storage___shared__) /* __shared__ family, sized forms first, then the `_unsized` forms */
+#define __storage___shared__ static __var_used__
+#endif /* __storage___shared__ */
+
+#if !defined(__storage_extern__shared__)
+#define __storage_extern__shared__ static __var_used__
+#endif /* __storage_extern__shared__ */
+
+#if !defined(__storage_auto__shared__) /* unlike the other families, auto is accepted here and maps to plain `static` */
+#define __storage_auto__shared__ static
+#endif /* __storage_auto__shared__ */
+
+#if !defined(__storage_static__shared__)
+#define __storage_static__shared__ static __var_used__
+#endif /* __storage_static__shared__ */
+
+#if !defined(__storage__unsized__shared__) /* replacement text below is not valid C, so any expansion fails to compile */
+#define __storage__unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage__unsized__shared__ */
+
+#if !defined(__storage_extern_unsized__shared__) /* the only unsized form with a compilable replacement */
+#define __storage_extern_unsized__shared__ static __var_used__
+#endif /* __storage_extern_unsized__shared__ */
+
+#if !defined(__storage_auto_unsized__shared__) /* not compilable, as above */
+#define __storage_auto_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto_unsized__shared__ */
+
+#if !defined(__storage_static_unsized__shared__) /* not compilable, as above */
+#define __storage_static_unsized__shared__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_static_unsized__shared__ */
+
+#if !defined(__storage___text__) /* __text__ family: the plain, extern and static forms all become `static __var_used__` */
+#define __storage___text__ static __var_used__
+#endif /* __storage___text__ */
+
+#if !defined(__storage_extern__text__)
+#define __storage_extern__text__ static __var_used__
+#endif /* __storage_extern__text__ */
+
+#if !defined(__storage_auto__text__) /* replacement text below is not valid C, so any expansion fails to compile */
+#define __storage_auto__text__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__text__ */
+
+#if !defined(__storage_static__text__)
+#define __storage_static__text__ static __var_used__
+#endif /* __storage_static__text__ */
+
+#if !defined(__storage___surf__) /* __surf__ family: the plain, extern and static forms all become `static __var_used__` */
+#define __storage___surf__ static __var_used__
+#endif /* __storage___surf__ */
+
+#if !defined(__storage_extern__surf__)
+#define __storage_extern__surf__ static __var_used__
+#endif /* __storage_extern__surf__ */
+
+#if !defined(__storage_auto__surf__) /* replacement text below is not valid C, so any expansion fails to compile */
+#define __storage_auto__surf__ @@@ COMPILER @@@ ERROR @@@
+#endif /* __storage_auto__surf__ */
+
+#if !defined(__storage_static__surf__)
+#define __storage_static__surf__ static __var_used__
+#endif /* __storage_static__surf__ */
+
+#endif /* !__STORAGE_CLASS_H__ */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_STORAGE_CLASS_H__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuComplex.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuComplex.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b167111b0b387a5279da6749d946560e1c42c1b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuComplex.h
@@ -0,0 +1,348 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CU_COMPLEX_H_)
+#define CU_COMPLEX_H_
+
+#if !defined(__CUDACC_RTC__)
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#endif
+#endif
+
+/* Including a C header from C++ code normally requires an extern "C" wrapper.
+ * The standard QNX headers, however, already contain their own
+ * #ifdef-guarded extern "C" when compiled as C++, and extern "C" cannot be
+ * nested there. Hence <math.h> is included outside the extern "C" block below.
+ */
+
+#if !defined(__CUDACC__)
+#include <math.h>       /* import fabsf, sqrt */
+#endif /* !defined(__CUDACC__) */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#include "vector_types.h"
+
+typedef float2 cuFloatComplex;
+
+/* Real part of a single-precision complex number: its .x member. */
+__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) {
+    return x.x;
+}
+
+/* Imaginary part of a single-precision complex number: its .y member. */
+__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) {
+    return x.y;
+}
+
+/* Packs a real part r and an imaginary part i into a cuFloatComplex (.x = r, .y = i). */
+__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex (float r, float i)
+{
+    cuFloatComplex packed;
+    packed.x = r;
+    packed.y = i;
+    return packed;
+}
+
+/* Complex conjugate: same real part, negated imaginary part. */
+__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x) {
+    return make_cuFloatComplex (x.x, -x.y);
+}
+/* Component-wise sum of two single-precision complex numbers. */
+__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    return make_cuFloatComplex (x.x + y.x, x.y + y.y);
+}
+
+/* Component-wise difference x - y of two single-precision complex numbers. */
+__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    return make_cuFloatComplex (x.x - y.x, x.y - y.y);
+}
+
+/* Plain (unscaled) complex product:
+ *   re = re(x)*re(y) - im(x)*im(y),   im = re(x)*im(y) + im(x)*re(y).
+ * The intermediate products may overflow even when the final result would be
+ * in range; this is deliberately left unguarded to avoid losing performance.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    const float xr = cuCrealf(x);
+    const float xi = cuCimagf(x);
+    const float yr = cuCrealf(y);
+    const float yi = cuCimagf(y);
+    return make_cuFloatComplex ((xr * yr) - (xi * yi),
+                                (xr * yi) + (xi * yr));
+}
+
+/* Complex quotient x / y, guarded against intermediate underflow and overflow
+ * by scaling: both operands are first multiplied by 1 / (|re(y)| + |im(y)|),
+ * then x * conj(y) / |y|^2 is evaluated on the scaled values.
+ * Guarded versions like this one are usually the default in complex library
+ * implementations, with some also offering an unguarded, faster variant.
+ * No special case is made for y == 0.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    const float scale     = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
+    const float inv_scale = 1.0f / scale;
+    const float xr = cuCrealf(x) * inv_scale;
+    const float xi = cuCimagf(x) * inv_scale;
+    const float yr = cuCrealf(y) * inv_scale;
+    const float yi = cuCimagf(y) * inv_scale;
+    const float norm     = (yr * yr) + (yi * yi);
+    const float inv_norm = 1.0f / norm;
+    return make_cuFloatComplex (((xr * yr) + (xi * yi)) * inv_norm,
+                                ((xi * yr) - (xr * yi)) * inv_norm);
+}
+
+/*
+ * Magnitude |x| of a single-precision complex number. hypotf() would be the
+ * natural choice, but it is not available on all platforms, so the value is
+ * computed here as v * sqrt(1 + (w/v)^2) with v the larger and w the smaller
+ * of |re(x)| and |im(x)|. Scaling by v guards against intermediate underflow
+ * and overflow, which would otherwise cost half the exponent range. Accuracy
+ * may suffer if sqrt and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
+{
+    const float a = fabsf(cuCrealf(x));
+    const float b = fabsf(cuCimagf(x));
+    /* v takes a only when a > b holds; otherwise v = b and w = a. */
+    const float v = (a > b) ? a : b;
+    const float w = (a > b) ? b : a;
+    float t;
+
+    /* The scaled formula is always evaluated first ... */
+    t = w / v;
+    t = 1.0f + t * t;
+    t = v * sqrtf(t);
+
+    /* ... and is then replaced by v + w when v is zero or when either
+     * magnitude compares greater than 3.402823466e38f. */
+    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
+        t = v + w;
+    }
+
+    return t;
+}
+
+/* Double precision */
+typedef double2 cuDoubleComplex;
+
+/* Real part of a double-precision complex number: its .x member. */
+__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) {
+    return x.x;
+}
+
+/* Imaginary part of a double-precision complex number: its .y member. */
+__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x) {
+    return x.y;
+}
+
+/* Packs a real part r and an imaginary part i into a cuDoubleComplex (.x = r, .y = i). */
+__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex (double r, double i)
+{
+    cuDoubleComplex packed;
+    packed.x = r;
+    packed.y = i;
+    return packed;
+}
+
+/* Complex conjugate: same real part, negated imaginary part. */
+__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x) {
+    return make_cuDoubleComplex (x.x, -x.y);
+}
+
+/* Component-wise sum of two double-precision complex numbers. */
+__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (x.x + y.x, x.y + y.y);
+}
+
+/* Component-wise difference x - y of two double-precision complex numbers. */
+__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (x.x - y.x, x.y - y.y);
+}
+
+/* Plain (unscaled) complex product:
+ *   re = re(x)*re(y) - im(x)*im(y),   im = re(x)*im(y) + im(x)*re(y).
+ * The intermediate products may overflow even when the final result would be
+ * in range; this is deliberately left unguarded to avoid losing performance.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    const double xr = cuCreal(x);
+    const double xi = cuCimag(x);
+    const double yr = cuCreal(y);
+    const double yi = cuCimag(y);
+    return make_cuDoubleComplex ((xr * yr) - (xi * yi),
+                                 (xr * yi) + (xi * yr));
+}
+
+/* Complex quotient x / y, guarded against intermediate underflow and overflow
+ * by scaling: both operands are first multiplied by 1 / (|re(y)| + |im(y)|),
+ * then x * conj(y) / |y|^2 is evaluated on the scaled values.
+ * Guarded versions like this one are usually the default in complex library
+ * implementations, with some also offering an unguarded, faster variant.
+ * No special case is made for y == 0.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    const double scale     = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
+    const double inv_scale = 1.0 / scale;
+    const double xr = cuCreal(x) * inv_scale;
+    const double xi = cuCimag(x) * inv_scale;
+    const double yr = cuCreal(y) * inv_scale;
+    const double yi = cuCimag(y) * inv_scale;
+    const double norm     = (yr * yr) + (yi * yi);
+    const double inv_norm = 1.0 / norm;
+    return make_cuDoubleComplex (((xr * yr) + (xi * yi)) * inv_norm,
+                                 ((xi * yr) - (xr * yi)) * inv_norm);
+}
+
+/* Magnitude |x| of a double-precision complex number, computed as
+ * v * sqrt(1 + (w/v)^2) with v the larger and w the smaller of |re(x)| and
+ * |im(x)|. Scaling by v guards against intermediate underflow and overflow,
+ * which would otherwise cost half the exponent range. Accuracy may suffer if
+ * sqrt and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
+{
+    const double a = fabs(cuCreal(x));
+    const double b = fabs(cuCimag(x));
+    /* v takes a only when a > b holds; otherwise v = b and w = a. */
+    const double v = (a > b) ? a : b;
+    const double w = (a > b) ? b : a;
+    double t;
+
+    /* The scaled formula is always evaluated first ... */
+    t = w / v;
+    t = 1.0 + t * t;
+    t = v * sqrt(t);
+
+    /* ... and is then replaced by v + w when v is zero or when either
+     * magnitude compares greater than 1.79769313486231570e+308. */
+    if ((v == 0.0) ||
+        (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
+        t = v + w;
+    }
+
+    return t;
+}
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+/* aliases */
+typedef cuFloatComplex cuComplex;
+/* cuComplex-named alias of make_cuFloatComplex: passes x and y through in the
+ * same order and returns the result. */
+__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, float y) {
+    return make_cuFloatComplex (x, y);
+}
+
+/* float-to-double promotion: widens both components of c with a (double) cast. */
+__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
+                                                      (cuFloatComplex c)
+{
+    return make_cuDoubleComplex ((double)c.x, (double)c.y);
+}
+
+/* double-to-float demotion: converts both components of c with a (float) cast. */
+__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat (cuDoubleComplex c)
+{
+    return make_cuFloatComplex ((float)c.x, (float)c.y);
+}
+
+
+/* Complex multiply-add x * y + d, accumulated in two steps per component:
+ * first the re(x) products are added to d, then the im(x) products. */
+__host__ __device__ static __inline__  cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
+{
+    /* Step 1: re(x) * y + d. */
+    float re = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
+    float im = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
+
+    /* Step 2: subtract im(x)*im(y) from the real sum, add im(x)*re(y) to the imaginary sum. */
+    re = -(cuCimagf(x) * cuCimagf(y)) + re;
+    im =  (cuCimagf(x) * cuCrealf(y)) + im;
+    return make_cuComplex(re, im);
+}
+
+/* Complex multiply-add x * y + d, accumulated in two steps per component:
+ * first the re(x) products are added to d, then the im(x) products. */
+__host__ __device__ static __inline__  cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
+{
+    /* Step 1: re(x) * y + d. */
+    double re = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
+    double im = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
+
+    /* Step 2: subtract im(x)*im(y) from the real sum, add im(x)*re(y) to the imaginary sum. */
+    re = -(cuCimag(x) * cuCimag(y)) + re;
+    im =  (cuCimag(x) * cuCreal(y)) + im;
+    return make_cuDoubleComplex(re, im);
+}
+
+#endif /* !defined(CU_COMPLEX_H_) */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..66c74d8c48d2a80fdfbccb3dca0c992c59c1d0ff
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda.h
@@ -0,0 +1,26280 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __cuda_cuda_h__
+#define __cuda_cuda_h__
+
+
+
+/* Fixed-width unsigned aliases: MSVC builds use the __int32/__int64 keywords, every other compiler uses <stdint.h>. */
+#include <stdlib.h>
+#ifdef _MSC_VER
+typedef unsigned __int32 cuuint32_t;
+typedef unsigned __int64 cuuint64_t;
+#else
+#include <stdint.h>
+typedef uint32_t cuuint32_t;
+typedef uint64_t cuuint64_t;
+#endif
+/* __CUDA_DEPRECATED: empty when any of the three macros tested first is defined or the compiler is neither MSVC nor GNU-compatible; otherwise the compiler's deprecation attribute. */
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/* Stop the build with an explicit message if the unsupported CUDA_FORCE_API_VERSION switch is defined. */
+#if defined(CUDA_FORCE_API_VERSION)
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+/* __CUDA_API_PTDS / __CUDA_API_PTSZ paste a _ptds / _ptsz suffix onto an API name in per-thread default stream builds; otherwise they expand to the name unchanged. */
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDA_API_PTDS(api) api ## _ptds
+    #define __CUDA_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDA_API_PTDS(api) api
+    #define __CUDA_API_PTSZ(api) api
+#endif
+/* Map unsuffixed driver API names onto their versioned entry points; copy/memset/capture names also pass through __CUDA_API_PTDS/__CUDA_API_PTSZ. A few names (e.g. cuCtxCreate_v3) map to themselves. */
+#define cuDeviceTotalMem                    cuDeviceTotalMem_v2
+#define cuCtxCreate                         cuCtxCreate_v2
+#define cuCtxCreate_v3                      cuCtxCreate_v3
+#define cuCtxCreate_v4                      cuCtxCreate_v4
+#define cuModuleGetGlobal                   cuModuleGetGlobal_v2
+#define cuMemGetInfo                        cuMemGetInfo_v2
+#define cuMemAlloc                          cuMemAlloc_v2
+#define cuMemAllocPitch                     cuMemAllocPitch_v2
+#define cuMemFree                           cuMemFree_v2
+#define cuMemGetAddressRange                cuMemGetAddressRange_v2
+#define cuMemAllocHost                      cuMemAllocHost_v2
+#define cuMemHostGetDevicePointer           cuMemHostGetDevicePointer_v2
+#define cuMemcpyHtoD                        __CUDA_API_PTDS(cuMemcpyHtoD_v2)
+#define cuMemcpyDtoH                        __CUDA_API_PTDS(cuMemcpyDtoH_v2)
+#define cuMemcpyDtoD                        __CUDA_API_PTDS(cuMemcpyDtoD_v2)
+#define cuMemcpyDtoA                        __CUDA_API_PTDS(cuMemcpyDtoA_v2)
+#define cuMemcpyAtoD                        __CUDA_API_PTDS(cuMemcpyAtoD_v2)
+#define cuMemcpyHtoA                        __CUDA_API_PTDS(cuMemcpyHtoA_v2)
+#define cuMemcpyAtoH                        __CUDA_API_PTDS(cuMemcpyAtoH_v2)
+#define cuMemcpyAtoA                        __CUDA_API_PTDS(cuMemcpyAtoA_v2)
+#define cuMemcpyHtoAAsync                   __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
+#define cuMemcpyAtoHAsync                   __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
+#define cuMemcpy2D                          __CUDA_API_PTDS(cuMemcpy2D_v2)
+#define cuMemcpy2DUnaligned                 __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
+#define cuMemcpy3D                          __CUDA_API_PTDS(cuMemcpy3D_v2)
+#define cuMemcpyHtoDAsync                   __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
+#define cuMemcpyDtoHAsync                   __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
+#define cuMemcpyDtoDAsync                   __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
+#define cuMemcpy2DAsync                     __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
+#define cuMemcpy3DAsync                     __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+#define cuMemcpyBatchAsync                  __CUDA_API_PTSZ(cuMemcpyBatchAsync)
+#define cuMemcpy3DBatchAsync                __CUDA_API_PTSZ(cuMemcpy3DBatchAsync)
+#define cuMemsetD8                          __CUDA_API_PTDS(cuMemsetD8_v2)
+#define cuMemsetD16                         __CUDA_API_PTDS(cuMemsetD16_v2)
+#define cuMemsetD32                         __CUDA_API_PTDS(cuMemsetD32_v2)
+#define cuMemsetD2D8                        __CUDA_API_PTDS(cuMemsetD2D8_v2)
+#define cuMemsetD2D16                       __CUDA_API_PTDS(cuMemsetD2D16_v2)
+#define cuMemsetD2D32                       __CUDA_API_PTDS(cuMemsetD2D32_v2)
+#define cuArrayCreate                       cuArrayCreate_v2
+#define cuArrayGetDescriptor                cuArrayGetDescriptor_v2
+#define cuArray3DCreate                     cuArray3DCreate_v2
+#define cuArray3DGetDescriptor              cuArray3DGetDescriptor_v2
+#define cuTexRefSetAddress                  cuTexRefSetAddress_v2
+#define cuTexRefGetAddress                  cuTexRefGetAddress_v2
+#define cuGraphicsResourceGetMappedPointer  cuGraphicsResourceGetMappedPointer_v2
+#define cuCtxDestroy                        cuCtxDestroy_v2
+#define cuCtxPopCurrent                     cuCtxPopCurrent_v2
+#define cuCtxPushCurrent                    cuCtxPushCurrent_v2
+#define cuStreamDestroy                     cuStreamDestroy_v2
+#define cuEventDestroy                      cuEventDestroy_v2
+#define cuTexRefSetAddress2D                cuTexRefSetAddress2D_v3
+#define cuLinkCreate                        cuLinkCreate_v2
+#define cuLinkAddData                       cuLinkAddData_v2
+#define cuLinkAddFile                       cuLinkAddFile_v2
+#define cuMemHostRegister                   cuMemHostRegister_v2
+#define cuGraphicsResourceSetMapFlags       cuGraphicsResourceSetMapFlags_v2
+#define cuStreamBeginCapture                __CUDA_API_PTSZ(cuStreamBeginCapture_v2)
+#define cuDevicePrimaryCtxRelease           cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxReset             cuDevicePrimaryCtxReset_v2
+#define cuDevicePrimaryCtxSetFlags          cuDevicePrimaryCtxSetFlags_v2
+#define cuDeviceGetUuid_v2                  cuDeviceGetUuid_v2
+#define cuIpcOpenMemHandle                  cuIpcOpenMemHandle_v2
+/* Further aliases: cuGraphInstantiate resolves to cuGraphInstantiateWithFlags; graph kernel-node and stream value/batch/capture-info names resolve to their _v2 forms. */
+#define cuGraphInstantiate                  cuGraphInstantiateWithFlags
+
+#define cuGraphExecUpdate                   cuGraphExecUpdate_v2 
+#define cuGetProcAddress                    cuGetProcAddress_v2
+#define cuGraphAddKernelNode                cuGraphAddKernelNode_v2
+#define cuGraphKernelNodeGetParams          cuGraphKernelNodeGetParams_v2
+#define cuGraphKernelNodeSetParams          cuGraphKernelNodeSetParams_v2
+#define cuGraphExecKernelNodeSetParams      cuGraphExecKernelNodeSetParams_v2
+
+#define cuStreamWriteValue32                __CUDA_API_PTSZ(cuStreamWriteValue32_v2)
+#define cuStreamWaitValue32                 __CUDA_API_PTSZ(cuStreamWaitValue32_v2)
+#define cuStreamWriteValue64                __CUDA_API_PTSZ(cuStreamWriteValue64_v2)
+#define cuStreamWaitValue64                 __CUDA_API_PTSZ(cuStreamWaitValue64_v2)
+#define cuStreamBatchMemOp                  __CUDA_API_PTSZ(cuStreamBatchMemOp_v2)
+#define cuStreamGetCaptureInfo              __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
+#define cuStreamGetCaptureInfo_v2           __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
+/* Only in per-thread default stream builds: redirect the entry points listed below to their _ptds/_ptsz-suffixed variants. */
+#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define cuMemcpy                            __CUDA_API_PTDS(cuMemcpy)
+    #define cuMemcpyAsync                       __CUDA_API_PTSZ(cuMemcpyAsync)
+    #define cuMemcpyPeer                        __CUDA_API_PTDS(cuMemcpyPeer)
+    #define cuMemcpyPeerAsync                   __CUDA_API_PTSZ(cuMemcpyPeerAsync)
+    #define cuMemcpy3DPeer                      __CUDA_API_PTDS(cuMemcpy3DPeer)
+    #define cuMemcpy3DPeerAsync                 __CUDA_API_PTSZ(cuMemcpy3DPeerAsync)
+    #define cuMemPrefetchAsync                  __CUDA_API_PTSZ(cuMemPrefetchAsync)
+    #define cuMemPrefetchAsync_v2               __CUDA_API_PTSZ(cuMemPrefetchAsync_v2)
+
+    #define cuMemsetD8Async                     __CUDA_API_PTSZ(cuMemsetD8Async)
+    #define cuMemsetD16Async                    __CUDA_API_PTSZ(cuMemsetD16Async)
+    #define cuMemsetD32Async                    __CUDA_API_PTSZ(cuMemsetD32Async)
+    #define cuMemsetD2D8Async                   __CUDA_API_PTSZ(cuMemsetD2D8Async)
+    #define cuMemsetD2D16Async                  __CUDA_API_PTSZ(cuMemsetD2D16Async)
+    #define cuMemsetD2D32Async                  __CUDA_API_PTSZ(cuMemsetD2D32Async)
+
+    #define cuStreamGetPriority                 __CUDA_API_PTSZ(cuStreamGetPriority)
+    #define cuStreamGetId                       __CUDA_API_PTSZ(cuStreamGetId)
+    #define cuStreamGetFlags                    __CUDA_API_PTSZ(cuStreamGetFlags)
+    #define cuStreamGetDevice                   __CUDA_API_PTSZ(cuStreamGetDevice)
+    #define cuStreamGetCtx                      __CUDA_API_PTSZ(cuStreamGetCtx)
+    #define cuStreamGetCtx_v2                   __CUDA_API_PTSZ(cuStreamGetCtx_v2)
+    #define cuStreamWaitEvent                   __CUDA_API_PTSZ(cuStreamWaitEvent)
+    #define cuStreamEndCapture                  __CUDA_API_PTSZ(cuStreamEndCapture)
+    #define cuStreamIsCapturing                 __CUDA_API_PTSZ(cuStreamIsCapturing)
+    #define cuStreamGetCaptureInfo_v3           __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v3)
+    #define cuStreamUpdateCaptureDependencies   __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies)
+    #define cuStreamUpdateCaptureDependencies_v2 __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies_v2)
+    #define cuStreamAddCallback                 __CUDA_API_PTSZ(cuStreamAddCallback)
+    #define cuStreamAttachMemAsync              __CUDA_API_PTSZ(cuStreamAttachMemAsync)
+    #define cuStreamQuery                       __CUDA_API_PTSZ(cuStreamQuery)
+    #define cuStreamSynchronize                 __CUDA_API_PTSZ(cuStreamSynchronize)
+    #define cuEventRecord                       __CUDA_API_PTSZ(cuEventRecord)
+    #define cuEventRecordWithFlags              __CUDA_API_PTSZ(cuEventRecordWithFlags)
+    #define cuLaunchKernel                      __CUDA_API_PTSZ(cuLaunchKernel)
+    #define cuLaunchKernelEx                    __CUDA_API_PTSZ(cuLaunchKernelEx)
+    #define cuLaunchHostFunc                    __CUDA_API_PTSZ(cuLaunchHostFunc)
+    #define cuGraphicsMapResources              __CUDA_API_PTSZ(cuGraphicsMapResources)
+    #define cuGraphicsUnmapResources            __CUDA_API_PTSZ(cuGraphicsUnmapResources)
+
+
+    #define cuLaunchCooperativeKernel           __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
+
+    #define cuSignalExternalSemaphoresAsync     __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
+    #define cuWaitExternalSemaphoresAsync       __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync)
+
+    #define cuGraphInstantiateWithParams        __CUDA_API_PTSZ(cuGraphInstantiateWithParams)
+    #define cuGraphUpload                       __CUDA_API_PTSZ(cuGraphUpload)
+    #define cuGraphLaunch                       __CUDA_API_PTSZ(cuGraphLaunch)
+    #define cuStreamCopyAttributes              __CUDA_API_PTSZ(cuStreamCopyAttributes)
+    #define cuStreamGetAttribute                __CUDA_API_PTSZ(cuStreamGetAttribute)
+    #define cuStreamSetAttribute                __CUDA_API_PTSZ(cuStreamSetAttribute)
+    #define cuMemMapArrayAsync                  __CUDA_API_PTSZ(cuMemMapArrayAsync)
+
+    #define cuMemFreeAsync                      __CUDA_API_PTSZ(cuMemFreeAsync)
+    #define cuMemAllocAsync                     __CUDA_API_PTSZ(cuMemAllocAsync)
+    #define cuMemAllocFromPoolAsync             __CUDA_API_PTSZ(cuMemAllocFromPoolAsync)
+
+    #define cuStreamBeginCaptureToGraph         __CUDA_API_PTSZ(cuStreamBeginCaptureToGraph)
+
+#endif
+/* Defined outside the preceding #if, so this alias always exists; it gains the _ptsz suffix only when __CUDA_API_PTSZ is the suffixing form. */
+#define cuMemBatchDecompressAsync               __CUDA_API_PTSZ(cuMemBatchDecompressAsync)
+
+/**
+ * \file cuda.h
+ * \brief Header file for the CUDA Toolkit application programming interface.
+ *
+ * \file cudaGL.h
+ * \brief Header file for the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * \file cudaD3D9.h
+ * \brief Header file for the Direct3D 9 interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_TYPES Data types used by CUDA driver
+ * @{
+ */
+
+/**
+ * CUDA API version number (NOTE(review): presumably encoded as 1000*major + 10*minor, i.e. 12.8 - confirm against the toolkit documentation)
+ */
+#define CUDA_VERSION 12080
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * CUDA device pointer
+ * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+ */
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr_v2; /* selected when _WIN64 or __LP64__ is defined */
+#else
+typedef unsigned int CUdeviceptr_v2; /* selected on every other target */
+#endif
+typedef CUdeviceptr_v2 CUdeviceptr;                          /**< CUDA device pointer */
+
+typedef int CUdevice_v1;                                     /**< CUDA device (a plain int) */
+typedef CUdevice_v1 CUdevice;                                /**< CUDA device (alias of ::CUdevice_v1) */
+typedef struct CUctx_st *CUcontext;                          /**< A regular context handle */
+typedef struct CUmod_st *CUmodule;                           /**< CUDA module */
+typedef struct CUfunc_st *CUfunction;                        /**< CUDA function */
+typedef struct CUlib_st *CUlibrary;                          /**< CUDA library */
+typedef struct CUkern_st *CUkernel;                          /**< CUDA kernel */
+typedef struct CUarray_st *CUarray;                          /**< CUDA array */
+typedef struct CUmipmappedArray_st *CUmipmappedArray;        /**< CUDA mipmapped array */
+typedef struct CUtexref_st *CUtexref;                        /**< CUDA texture reference */
+typedef struct CUsurfref_st *CUsurfref;                      /**< CUDA surface reference */
+typedef struct CUevent_st *CUevent;                          /**< CUDA event */
+typedef struct CUstream_st *CUstream;                        /**< CUDA stream */
+typedef struct CUgraphicsResource_st *CUgraphicsResource;    /**< CUDA graphics interop resource */
+typedef unsigned long long CUtexObject_v1;                   /**< An opaque value that represents a CUDA texture object */
+typedef CUtexObject_v1 CUtexObject;                          /**< An opaque value that represents a CUDA texture object */
+typedef unsigned long long CUsurfObject_v1;                  /**< An opaque value that represents a CUDA surface object */
+typedef CUsurfObject_v1 CUsurfObject;                        /**< An opaque value that represents a CUDA surface object */ 
+typedef struct CUextMemory_st *CUexternalMemory;             /**< CUDA external memory */
+typedef struct CUextSemaphore_st *CUexternalSemaphore;       /**< CUDA external semaphore */
+typedef struct CUgraph_st *CUgraph;                          /**< CUDA graph */
+typedef struct CUgraphNode_st *CUgraphNode;                  /**< CUDA graph node */
+typedef struct CUgraphExec_st *CUgraphExec;                  /**< CUDA executable graph */
+typedef struct CUmemPoolHandle_st *CUmemoryPool;             /**< CUDA memory pool */
+typedef struct CUuserObject_st *CUuserObject;                /**< CUDA user object for graphs */
+typedef cuuint64_t CUgraphConditionalHandle; /**< CUDA graph conditional handle (a 64-bit integer, not a pointer) */
+typedef struct CUgraphDeviceUpdatableNode_st *CUgraphDeviceNode; /**< CUDA graph device node handle */
+typedef struct CUasyncCallbackEntry_st *CUasyncCallbackHandle;            /**< CUDA async notification callback handle */
+/*!
+ * \typedef typedef struct CUgreenCtx_st* CUgreenCtx
+ * A green context handle. This handle can be used safely from only one CPU thread at a time.
+ * Created via ::cuGreenCtxCreate
+ */
+typedef struct CUgreenCtx_st *CUgreenCtx;
+
+#ifndef CU_UUID_HAS_BEEN_DEFINED /* skip the definition if this guard macro is already set */
+#define CU_UUID_HAS_BEEN_DEFINED
+typedef struct CUuuid_st {                                /**< CUDA definition of UUID */
+    char bytes[16];                                       /**< The 16 raw bytes of the UUID */
+} CUuuid;
+#endif
+
+/**
+ * CUDA IPC handle size: the array length (64) of the fabric, IPC event and IPC mem handle structs declared below
+ */
+#define CU_IPC_HANDLE_SIZE 64
+
+/**
+ * Fabric handle - An opaque handle representing a memory allocation
+ * that can be exported to processes in same or different nodes. For IPC
+ * between processes on different nodes they must be connected via the
+ * NVSwitch fabric.
+ */
+typedef struct CUmemFabricHandle_st {
+    unsigned char data[CU_IPC_HANDLE_SIZE]; /**< Opaque handle payload, CU_IPC_HANDLE_SIZE bytes */
+} CUmemFabricHandle_v1;
+typedef CUmemFabricHandle_v1 CUmemFabricHandle;
+
+/**
+ * CUDA IPC event handle
+ */
+typedef struct CUipcEventHandle_st {
+    char reserved[CU_IPC_HANDLE_SIZE]; /**< Handle storage, CU_IPC_HANDLE_SIZE bytes; contents are not described here */
+} CUipcEventHandle_v1;
+typedef CUipcEventHandle_v1 CUipcEventHandle;
+
+/**
+ * CUDA IPC mem handle
+ */
+typedef struct CUipcMemHandle_st {
+    char reserved[CU_IPC_HANDLE_SIZE]; /**< Handle storage, CU_IPC_HANDLE_SIZE bytes; contents are not described here */
+} CUipcMemHandle_v1;
+typedef CUipcMemHandle_v1 CUipcMemHandle;
+
+/**
+ * CUDA IPC memory flags
+ */
+typedef enum CUipcMem_flags_enum {
+    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
+} CUipcMem_flags;
+
+
+/**
+ * CUDA memory attach flags (three distinct bit values: 0x1, 0x2, 0x4)
+ */
+typedef enum CUmemAttach_flags_enum {
+    CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */
+    CU_MEM_ATTACH_HOST   = 0x2, /**< Memory cannot be accessed by any stream on any device */
+    CU_MEM_ATTACH_SINGLE = 0x4  /**< Memory can only be accessed by a single stream on the associated device */
+} CUmemAttach_flags;
+
+/**
+ * Context creation flags
+ */
+typedef enum CUctx_flags_enum {
+    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
+    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
+    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
+    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling
+                                         *  \deprecated This flag was deprecated as of CUDA 4.0
+                                         *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
+    CU_CTX_SCHED_MASK          = 0x07, /**< Mask of the scheduling bits (0x01 | 0x02 | 0x04) */
+    CU_CTX_MAP_HOST            = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 
+                                         *  and it no longer has any effect. All contexts 
+                                         *  as of CUDA 3.2 behave as though the flag is enabled. */
+    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
+    CU_CTX_COREDUMP_ENABLE     = 0x20, /**< Trigger coredumps from exceptions in this context */
+    CU_CTX_USER_COREDUMP_ENABLE= 0x40, /**< Enable user pipe to trigger coredumps in this context */
+    CU_CTX_SYNC_MEMOPS         = 0x80, /**< Ensure synchronous memory operations on this context will synchronize */
+    CU_CTX_FLAGS_MASK          = 0xFF /**< Mask of every flag bit enumerated above (0x01 through 0x80) */
+} CUctx_flags;
+
+/**
+ * Event scheduling flags (numeric values match the corresponding CU_CTX_SCHED_* constants)
+ */
+typedef enum CUevent_sched_flags_enum {
+    CU_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    CU_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    CU_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    CU_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} CUevent_sched_flags;
+
+/**
+ * NVCL event scheduling flags (same numeric values as ::CUevent_sched_flags)
+ */
+typedef enum cl_event_flags_enum {
+    NVCL_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    NVCL_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    NVCL_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    NVCL_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} cl_event_flags;
+
+/**
+ * NVCL context scheduling flags (same numeric values as the CU_CTX_SCHED_* constants)
+ */
+typedef enum cl_context_flags_enum {
+    NVCL_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    NVCL_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    NVCL_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    NVCL_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} cl_context_flags;
+
+
+/**
+ * Stream creation flags
+ */
+typedef enum CUstream_flags_enum {
+    CU_STREAM_DEFAULT             = 0x0, /**< Default stream flag */
+    CU_STREAM_NON_BLOCKING        = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+} CUstream_flags;
+
+/**
+ * Legacy stream handle
+ *
+ * Stream handle that can be passed as a CUstream to use an implicit stream
+ * with legacy synchronization behavior. Defined as the constant 0x1 cast to ::CUstream.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define CU_STREAM_LEGACY     ((CUstream)0x1)
+
+/**
+ * Per-thread stream handle
+ *
+ * Stream handle that can be passed as a CUstream to use an implicit stream
+ * with per-thread synchronization behavior. Defined as the constant 0x2 cast to ::CUstream.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define CU_STREAM_PER_THREAD ((CUstream)0x2)
+
+/**
+ * Event creation flags (bit values 0x1, 0x2, 0x4; 0x0 is the default)
+ */
+typedef enum CUevent_flags_enum {
+    CU_EVENT_DEFAULT        = 0x0, /**< Default event flag */
+    CU_EVENT_BLOCKING_SYNC  = 0x1, /**< Event uses blocking synchronization */
+    CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
+    CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
+} CUevent_flags;
+
+/**
+ * Event record flags (0x0 default, 0x1 external)
+ */
+typedef enum CUevent_record_flags_enum {
+    CU_EVENT_RECORD_DEFAULT  = 0x0, /**< Default event record flag */
+    CU_EVENT_RECORD_EXTERNAL = 0x1  /**< When using stream capture, create an event record node
+                                      *  instead of the default behavior.  This flag is invalid
+                                      *  when used outside of capture. */
+} CUevent_record_flags;
+
+/**
+ * Event wait flags (0x0 default, 0x1 external)
+ */
+typedef enum CUevent_wait_flags_enum {
+    CU_EVENT_WAIT_DEFAULT  = 0x0, /**< Default event wait flag */
+    CU_EVENT_WAIT_EXTERNAL = 0x1  /**< When using stream capture, create an event wait node
+                                    *  instead of the default behavior.  This flag is invalid
+                                    *  when used outside of capture.*/
+} CUevent_wait_flags;
+
+/**
+ * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 (comparison modes are the values 0x0-0x3; FLUSH is the separate bit 1<<30)
+ */
+typedef enum CUstreamWaitValue_flags_enum {
+    CU_STREAM_WAIT_VALUE_GEQ   = 0x0,   /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit
+                                             values). Note this is a cyclic comparison which ignores wraparound.
+                                             (Default behavior.) */
+    CU_STREAM_WAIT_VALUE_EQ    = 0x1,   /**< Wait until *addr == value. */
+    CU_STREAM_WAIT_VALUE_AND   = 0x2,   /**< Wait until (*addr & value) != 0. */
+    CU_STREAM_WAIT_VALUE_NOR   = 0x3,   /**< Wait until ~(*addr | value) != 0. Support for this operation can be
+                                             queried with ::cuDeviceGetAttribute() and
+                                             ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/
+    CU_STREAM_WAIT_VALUE_FLUSH = 1<<30  /**< Follow the wait operation with a flush of outstanding remote writes. This
+                                             means that, if a remote write operation is guaranteed to have reached the
+                                             device before the wait can be satisfied, that write is guaranteed to be
+                                             visible to downstream device work. The device is permitted to reorder
+                                             remote writes internally. For example, this flag would be required if
+                                             two remote writes arrive in a defined order, the wait is satisfied by the
+                                             second write, and downstream work needs to observe the first write.
+                                             Support for this operation is restricted to selected platforms and can be
+                                             queried with ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.*/
+} CUstreamWaitValue_flags;
+
+/**
+ * Flags for ::cuStreamWriteValue32 (DEFAULT = 0x0, NO_MEMORY_BARRIER = 0x1)
+ */
+typedef enum CUstreamWriteValue_flags_enum {
+    CU_STREAM_WRITE_VALUE_DEFAULT           = 0x0, /**< Default behavior */
+    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1  /**< Permits the write to be reordered with writes which were issued
+                                                        before it, as a performance optimization. Normally,
+                                                        ::cuStreamWriteValue32 will provide a memory fence before the
+                                                        write, which has similar semantics to
+                                                        __threadfence_system() but is scoped to the stream
+                                                        rather than a CUDA thread.
+                                                        This flag is not supported in the v2 API. */
+} CUstreamWriteValue_flags;
+
+/**
+ * Operations for ::cuStreamBatchMemOp (enumerator values are not in declaration order: FLUSH_REMOTE_WRITES is 3 but is listed last)
+ */
+typedef enum CUstreamBatchMemOpType_enum {
+    CU_STREAM_MEM_OP_WAIT_VALUE_32  = 1,     /**< Represents a ::cuStreamWaitValue32 operation */
+    CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2,     /**< Represents a ::cuStreamWriteValue32 operation */
+    CU_STREAM_MEM_OP_WAIT_VALUE_64  = 4,     /**< Represents a ::cuStreamWaitValue64 operation */
+    CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5,     /**< Represents a ::cuStreamWriteValue64 operation */
+    CU_STREAM_MEM_OP_BARRIER = 6,            /**< Insert a memory barrier of the specified type */ 
+    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a
+                                                  standalone operation. */
+} CUstreamBatchMemOpType;
+
+/**
+ * Flags for ::cuStreamMemoryBarrier (NOTE(review): presumably the flags field of the memoryBarrier member of ::CUstreamBatchMemOpParams - confirm)
+ */
+typedef enum CUstreamMemoryBarrier_flags_enum {
+    CU_STREAM_MEMORY_BARRIER_TYPE_SYS = 0x0, /**< System-wide memory barrier. */
+    CU_STREAM_MEMORY_BARRIER_TYPE_GPU = 0x1 /**< Limit memory barrier scope to the GPU. */
+} CUstreamMemoryBarrier_flags;
+
+/**
+ * Per-operation parameters for ::cuStreamBatchMemOp
+ */
+typedef union CUstreamBatchMemOpParams_union {
+    CUstreamBatchMemOpType operation; /**< Operation tag; every variant struct below also starts with this member */
+    struct CUstreamMemOpWaitValueParams_st {
+        CUstreamBatchMemOpType operation;
+        CUdeviceptr address;
+        union {
+            cuuint32_t value;
+            cuuint64_t value64;
+        };
+        unsigned int flags;
+        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
+    } waitValue;
+    struct CUstreamMemOpWriteValueParams_st {
+        CUstreamBatchMemOpType operation;
+        CUdeviceptr address;
+        union {
+            cuuint32_t value;
+            cuuint64_t value64;
+        };
+        unsigned int flags;
+        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
+    } writeValue;
+    struct CUstreamMemOpFlushRemoteWritesParams_st {
+        CUstreamBatchMemOpType operation;
+        unsigned int flags;
+    } flushRemoteWrites;
+    struct CUstreamMemOpMemoryBarrierParams_st { /**< Only supported in the _v2 API */
+        CUstreamBatchMemOpType operation;
+        unsigned int flags;
+    } memoryBarrier;
+    cuuint64_t pad[6]; /**< Padding member: forces the union to be at least 48 bytes (6 x 8) */
+} CUstreamBatchMemOpParams_v1;
+typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams;
+
+typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st { /* same members, in the same order, as the _v2 struct that follows */
+    CUcontext ctx;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} CUDA_BATCH_MEM_OP_NODE_PARAMS_v1;
+typedef CUDA_BATCH_MEM_OP_NODE_PARAMS_v1 CUDA_BATCH_MEM_OP_NODE_PARAMS; /* the unversioned name refers to _v1 */
+
+/**
+ * Batch memory operation node parameters (v2; member-for-member identical to the _v1 struct declared before it)
+ */
+typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st {
+    CUcontext ctx;                        /**< Context to use for the operations. */
+    unsigned int count;                   /**< Number of operations in paramArray. */
+    CUstreamBatchMemOpParams *paramArray; /**< Array of batch memory operations. */
+    unsigned int flags;                   /**< Flags to control the node. */
+} CUDA_BATCH_MEM_OP_NODE_PARAMS_v2;
+
+/**
+ * Occupancy calculator flags
+ */
+typedef enum CUoccupancy_flags_enum {
+    CU_OCCUPANCY_DEFAULT                  = 0x0, /**< Default behavior */
+    CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1  /**< Assume global caching is enabled and cannot be automatically turned off */
+} CUoccupancy_flags;
+
+/**
+ * Flags for ::cuStreamUpdateCaptureDependencies (0x0 adds to the dependency set, 0x1 replaces it)
+ */
+typedef enum CUstreamUpdateCaptureDependencies_flags_enum {
+    CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */
+    CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1  /**< Replace the dependency set with the new nodes */
+} CUstreamUpdateCaptureDependencies_flags;
+
+/**
+ * Types of async notification that can be sent (only one type is defined here)
+ */
+typedef enum CUasyncNotificationType_enum {
+    CU_ASYNC_NOTIFICATION_TYPE_OVER_BUDGET = 0x1
+} CUasyncNotificationType;
+
+/**
+ * Information passed to the user via the async notification callback
+ */
+typedef struct CUasyncNotificationInfo_st {
+    CUasyncNotificationType type; /* notification kind; OVER_BUDGET (0x1) is the only enumerator */
+    union {
+        struct {
+            unsigned long long bytesOverBudget; /* NOTE(review): presumably a byte count over some budget - name-derived, confirm */
+        } overBudget;
+    } info; /* payload union; overBudget is its only member */
+} CUasyncNotificationInfo;
+
+/**
+ * CUDA async notification callback (a function-pointer type; the callback returns void)
+ * \param info Information describing what actions to take as a result of this trim notification.
+ * \param userData Pointer to user defined data provided at registration.
+ * \param callback The callback handle associated with this specific callback.
+ */
+typedef void (*CUasyncCallback)(CUasyncNotificationInfo *info, void *userData, CUasyncCallbackHandle callback);
+
+/**
+ * Array formats
+ */
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8            = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16           = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32           = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8              = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16             = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32             = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF                     = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT                    = 0x20, /**< 32-bit floating point */
+    CU_AD_FORMAT_NV12                     = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_UNORM_INT8X1             = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X2             = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X4             = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X1            = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X2            = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X4            = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X1             = 0xc6, /**< 1 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X2             = 0xc7, /**< 2 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X4             = 0xc8, /**< 4 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X1            = 0xc9, /**< 1 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X2            = 0xca, /**< 2 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X4            = 0xcb, /**< 4 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_BC1_UNORM                = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    CU_AD_FORMAT_BC1_UNORM_SRGB           = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding */
+    CU_AD_FORMAT_BC2_UNORM                = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    CU_AD_FORMAT_BC2_UNORM_SRGB           = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding */
+    CU_AD_FORMAT_BC3_UNORM                = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    CU_AD_FORMAT_BC3_UNORM_SRGB           = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding */
+    CU_AD_FORMAT_BC4_UNORM                = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC4_SNORM                = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC5_UNORM                = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC5_SNORM                = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC6H_UF16                = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC6H_SF16                = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC7_UNORM                = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    CU_AD_FORMAT_BC7_UNORM_SRGB           = 0x9e, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+    CU_AD_FORMAT_P010                     = 0x9f, /**< 10-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_P016                     = 0xa1, /**< 16-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_NV16                     = 0xa2, /**< 8-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_P210                     = 0xa3, /**< 10-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_P216                     = 0xa4, /**< 16-bit YUV planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_YUY2                     = 0xa5, /**< 2 channel, 8-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_Y210                     = 0xa6, /**< 2 channel, 10-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_Y216                     = 0xa7, /**< 2 channel, 16-bit YUV packed planar format, with 4:2:2 sampling */
+    CU_AD_FORMAT_AYUV                     = 0xa8, /**< 4 channel, 8-bit YUV packed planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y410                     = 0xa9, /**< 10-bit YUV packed planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y416                     = 0xb1, /**< 4 channel, 12-bit YUV packed planar format, with 4:4:4 sampling (NOTE(review): the name suggests 16-bit components, as with P016/Y216 above - confirm the bit depth) */
+    CU_AD_FORMAT_Y444_PLANAR8             = 0xb2, /**< 3 channel 8-bit YUV planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_Y444_PLANAR10            = 0xb3, /**< 3 channel 10-bit YUV planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_YUV444_8bit_SemiPlanar   = 0xb4, /**< 3 channel 8-bit YUV semi-planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_YUV444_16bit_SemiPlanar  = 0xb5, /**< 3 channel 16-bit YUV semi-planar format, with 4:4:4 sampling */
+    CU_AD_FORMAT_UNORM_INT_101010_2       = 0x50, /**< 4 channel unorm R10G10B10A2 RGB format */
+    CU_AD_FORMAT_MAX                      = 0x7FFFFFFF /**< Largest positive 32-bit signed value; presumably a sentinel that pins the enum's width rather than a usable format (confirm) */
+} CUarray_format;
+
+/**
+ * Texture reference addressing modes (CU_TR_ADDRESS_MODE_*)
+ */
+typedef enum CUaddress_mode_enum {
+    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
+} CUaddress_mode;
+
+/**
+ * Texture reference filtering modes (CU_TR_FILTER_MODE_*)
+ */
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+} CUfilter_mode;
+
+/**
+ * Device properties
+ */
+typedef enum CUdevice_attribute_enum {
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,                          /**< Maximum number of threads per block */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                                /**< Maximum block dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                                /**< Maximum block dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                                /**< Maximum block dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                                 /**< Maximum grid dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                                 /**< Maximum grid dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                                 /**< Maximum grid dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,                    /**< Maximum shared memory available per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,                        /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,                          /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                                     /**< Warp size in threads */
+    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                                     /**< Maximum pitch in bytes allowed by memory copies */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,                       /**< Maximum number of 32-bit registers available per block */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,                           /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                                    /**< Typical clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                             /**< Alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                                   /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
+    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,                          /**< Number of multiprocessors on device */
+    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,                           /**< Specifies whether there is a run time limit on kernels */
+    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                                    /**< Device is integrated with host memory */
+    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,                           /**< Device can map host memory into CUDA address space */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                                  /**< Compute mode (See ::CUcomputemode for details) */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,                       /**< Maximum 1D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,                       /**< Maximum 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,                      /**< Maximum 2D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,                       /**< Maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,                      /**< Maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,                       /**< Maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,               /**< Maximum 2D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,              /**< Maximum 2D layered texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,              /**< Maximum layers in a 2D layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,                 /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,                /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,             /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
+    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                             /**< Alignment requirement for surfaces */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                            /**< Device can possibly execute multiple kernels concurrently */
+    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                                   /**< Device has ECC support enabled */
+    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                                    /**< PCI bus ID of the device */
+    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                                 /**< PCI device ID of the device */
+    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                                    /**< Device is using TCC driver model */
+    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                             /**< Peak memory clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,                       /**< Global memory bus width in bits */
+    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                                 /**< Size of L2 cache in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,                /**< Maximum resident threads per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                            /**< Number of asynchronous engines */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                            /**< Device shares a unified address space with the host */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,               /**< Maximum 1D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,              /**< Maximum layers in a 1D layered texture */
+    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,                              /**< Deprecated, do not use. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,                /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,               /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,             /**< Alternate maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,            /**< Alternate maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,             /**< Alternate maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,                                 /**< PCI domain ID of the device */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,                       /**< Pitch alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,                  /**< Maximum cubemap texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,          /**< Maximum cubemap layered texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,         /**< Maximum layers in a cubemap layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,                       /**< Maximum 1D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,                       /**< Maximum 2D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,                      /**< Maximum 2D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,                       /**< Maximum 3D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,                      /**< Maximum 3D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,                       /**< Maximum 3D surface depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,               /**< Maximum 1D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,              /**< Maximum layers in a 1D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,               /**< Maximum 2D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,              /**< Maximum 2D layered surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,              /**< Maximum layers in a 2D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,                  /**< Maximum cubemap surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,          /**< Maximum cubemap layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,         /**< Maximum layers in a cubemap layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,                /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,                /**< Maximum 2D linear texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,               /**< Maximum 2D linear texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,                /**< Maximum 2D linear texture pitch in bytes */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,             /**< Maximum mipmapped 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,            /**< Maximum mipmapped 2D texture height */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,                      /**< Major compute capability version number */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,                      /**< Minor compute capability version number */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,             /**< Maximum mipmapped 1D texture width */
+    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,                   /**< Device supports stream priorities */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,                     /**< Device supports caching globals in L1 */
+    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,                      /**< Device supports caching locals in L1 */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,          /**< Maximum shared memory available per multiprocessor in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,              /**< Maximum number of 32-bit registers available per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,                                /**< Device can allocate managed memory on this system */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,                               /**< Device is on a multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,                      /**< Unique id for a group of devices on the same multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,                  /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware) */
+    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,         /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,                        /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,                     /**< Device can coherently access managed memory concurrently with the CPU */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,                  /**< Device supports compute preemption. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,       /**< Device can access host registered memory at the same virtual address as the CPU */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS_V1 = 92,                     /**< Deprecated, along with v1 MemOps API, ::cuStreamBatchMemOp and related APIs are supported. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 = 93,              /**< Deprecated, along with v1 MemOps API, 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 = 94,              /**< Deprecated, along with v1 MemOps API, ::CU_STREAM_WAIT_VALUE_NOR is supported. */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,                            /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,               /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,             /**< Maximum optin shared memory per block */
+    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,                       /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
+    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,                       /**< Device supports host memory registration via ::cudaHostRegister. */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
+    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,          /**< The host can directly access managed memory on the device without migration. */
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,         /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED */
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,         /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,  /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,           /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,       /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,                /**< Maximum number of blocks per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,                /**< Device supports compression of memory */
+    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,                 /**< Maximum L2 persisting lines capacity setting in bytes. */
+    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,                /**< Maximum value of CUaccessPolicyWindow::num_bytes. */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,      /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,             /**< Shared memory reserved by CUDA driver per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,                  /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
+    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,            /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
+    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,         /**< External timeline semaphore interop is supported on the device */
+    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,                       /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,                    /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,         /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,              /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
+    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,               /**< Handle types supported with mempool based IPC */
+    CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120,                               /**< Indicates device supports cluster launch */
+    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121,        /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 122,                /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related MemOp APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 123,                /**< ::CU_STREAM_WAIT_VALUE_NOR is supported by MemOp APIs. */
+    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124,                            /**< Device supports buffer sharing with dma_buf mechanism. */ 
+    CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED = 125,                          /**< Device supports IPC Events. */ 
+    CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT = 126,                        /**< Number of memory domains the device supports. */
+    CU_DEVICE_ATTRIBUTE_TENSOR_MAP_ACCESS_SUPPORTED = 127,                  /**< Device supports accessing memory using Tensor Map. */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED = 128,                 /**< Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate() */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS = 129,                    /**< Device supports unified function pointers. */
+    CU_DEVICE_ATTRIBUTE_NUMA_CONFIG = 130,                                  /**< NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum */
+    CU_DEVICE_ATTRIBUTE_NUMA_ID = 131,                                      /**< NUMA node ID of the GPU memory */
+    CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED = 132,                          /**< Device supports switch multicast and reduction operations. */
+    CU_DEVICE_ATTRIBUTE_MPS_ENABLED = 133,                                  /**< Indicates if contexts created on this device will be shared via MPS */
+    CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID = 134,                                 /**< NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. */
+    CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED = 135,                          /**< Device supports CIG with D3D12. */
+    CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK = 136,                /**< The returned value shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum. */
+    CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_MAXIMUM_LENGTH = 137,                /**< The returned value is the maximum length in bytes of a single decompress operation that is allowed. */
+    CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID    = 139, /**< The combined 16-bit PCI device ID and 16-bit PCI vendor ID. */
+    CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID = 140, /**< The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. */
+    CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED = 143,             /**< Device supports HOST_NUMA location IPC between nodes in a multi-node system. */
+    CU_DEVICE_ATTRIBUTE_MAX /**< Sentinel with no explicit value: one greater than the preceding enumerator (144 in this listing); values 138, 141 and 142 are not assigned here */
+} CUdevice_attribute;
+
+/**
+ * Legacy device properties
+ */
+typedef struct CUdevprop_st {
+    int maxThreadsPerBlock;     /**< Maximum number of threads per block */
+    int maxThreadsDim[3];       /**< Maximum size of each dimension of a block */
+    int maxGridSize[3];         /**< Maximum size of each dimension of a grid */
+    int sharedMemPerBlock;      /**< Shared memory available per block in bytes */
+    int totalConstantMemory;    /**< Constant memory available on device in bytes */
+    int SIMDWidth;              /**< Warp size in threads */
+    int memPitch;               /**< Maximum pitch in bytes allowed by memory copies */
+    int regsPerBlock;           /**< 32-bit registers available per block */
+    int clockRate;              /**< Clock frequency in kilohertz */
+    int textureAlign;           /**< Alignment requirement for textures */
+} CUdevprop_v1;
+typedef CUdevprop_v1 CUdevprop; /**< Unversioned alias of ::CUdevprop_v1 */
+
+/**
+ * Pointer information
+ */
+typedef enum CUpointer_attribute_enum {
+    CU_POINTER_ATTRIBUTE_CONTEXT = 1,                     /**< The ::CUcontext on which a pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,                 /**< The ::CUmemorytype describing the physical location of a pointer */
+    CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3,              /**< The address at which a pointer's memory may be accessed on the device */
+    CU_POINTER_ATTRIBUTE_HOST_POINTER = 4,                /**< The address at which a pointer's memory may be accessed on the host */
+    CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5,                  /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */
+    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,                 /**< Synchronize every synchronous memory operation initiated on this region */
+    CU_POINTER_ATTRIBUTE_BUFFER_ID = 7,                   /**< A process-wide unique ID for an allocated memory region */
+    CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,                  /**< Indicates if the pointer points to managed memory */
+    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9,              /**< A device ordinal of a device on which a pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise */
+    CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11,           /**< Starting address for this requested pointer */
+    CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12,                 /**< Size of the address range for this requested pointer */
+    CU_POINTER_ATTRIBUTE_MAPPED = 13,                     /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise */
+    CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14,       /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation */
+    CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16,               /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */
+    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17,             /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. */
+    CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18,               /**< Size of the actual underlying mapping that the pointer belongs to */
+    CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19,          /**< The start address of the mapping that the pointer belongs to */
+    CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20             /**< A process-wide unique id corresponding to the physical allocation the pointer belongs to */
+  , CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE = 21    /**< Returns in \p *data a boolean that indicates whether the pointer points to memory that is capable to be used for hardware accelerated decompression. */
+} CUpointer_attribute;
+
+/**
+ * Function properties
+ */
+typedef enum CUfunction_attribute_enum {
+    /**
+     * The maximum number of threads per block, beyond which a launch of the
+     * function would fail. This number depends on both the function and the
+     * device on which the function is currently loaded.
+     */
+    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+
+    /**
+     * The size in bytes of statically-allocated shared memory required by
+     * this function. This does not include dynamically-allocated shared
+     * memory requested by the user at runtime.
+     */
+    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+
+    /**
+     * The size in bytes of user-allocated constant memory required by this
+     * function.
+     */
+    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+
+    /**
+     * The size in bytes of local memory used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+
+    /**
+     * The number of registers used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+
+    /**
+     * The PTX virtual architecture version for which the function was
+     * compiled. This value is the major PTX version * 10 + the minor PTX
+     * version, so a PTX version 1.3 function would return the value 13.
+     * Note that this may return the undefined value of 0 for cubins
+     * compiled prior to CUDA 3.0.
+     */
+    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+
+    /**
+     * The binary architecture version for which the function was compiled.
+     * This value is the major binary version * 10 + the minor binary version,
+     * so a binary version 1.3 function would return the value 13. Note that
+     * this will return a value of 10 for legacy cubins that do not have a
+     * properly-encoded binary architecture version.
+     */
+    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+
+    /**
+     * The attribute to indicate whether the function has been compiled with
+     * the user-specified option "-Xptxas --dlcm=ca" set.
+     */
+    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
+
+    /**
+     * The maximum size in bytes of dynamically-allocated shared memory that can be used by
+     * this function. If the user-specified dynamic shared memory size is larger than this
+     * value, the launch will fail.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
+
+    /**
+     * On devices where the L1 cache and shared memory use the same hardware resources,
+     * this sets the shared memory carveout preference, in percent of the total shared memory.
+     * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
+     * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
+
+    /**
+     * If this attribute is set, the kernel must launch with a valid cluster
+     * size specified.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10,
+
+    /**
+     * The required cluster width in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11,
+
+    /**
+     * The required cluster height in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12,
+
+    /**
+     * The required cluster depth in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13,
+
+    /**
+     * Whether the function can be launched with non-portable cluster size. 1 is
+     * allowed, 0 is disallowed. A non-portable cluster size may only function
+     * on the specific SKUs the program is tested on. The launch might fail if
+     * the program is run on a different hardware platform.
+     *
+     * CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking
+     * whether the desired size can be launched on the current device.
+     *
+     * Portable Cluster Size
+     *
+     * A portable cluster size is guaranteed to be functional on all compute
+     * capabilities higher than the target compute capability. The portable
+     * cluster size for sm_90 is 8 blocks per cluster. This value may increase
+     * for future compute capabilities.
+     *
+     * The specific hardware unit may support higher cluster sizes that are not
+     * guaranteed to be portable.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14,
+
+    /**
+     * The block scheduling policy of a function. The value type is
+     * CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
+     * See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15,
+
+    CU_FUNC_ATTRIBUTE_MAX /**< Implicit value 16: one past the last attribute defined above */
+} CUfunction_attribute;
+
+/**
+ * Function cache configurations
+ */
+typedef enum CUfunc_cache_enum {
+    CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< No preference for shared memory or L1 cache (default) */
+    CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< Prefer larger shared memory and smaller L1 cache */
+    CU_FUNC_CACHE_PREFER_L1      = 0x02, /**< Prefer larger L1 cache and smaller shared memory */
+    CU_FUNC_CACHE_PREFER_EQUAL   = 0x03  /**< Prefer equal-sized L1 cache and shared memory */
+} CUfunc_cache;
+
+/**
+ * \deprecated
+ *
+ * Shared memory configurations
+ */
+typedef enum CUsharedconfig_enum {
+    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00, /**< Set the default shared memory bank size */
+    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01, /**< Set the shared memory bank width to four bytes */
+    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02  /**< Set the shared memory bank width to eight bytes */
+} CUsharedconfig;
+
+/**
+ * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute or ::cuKernelSetAttribute
+ */
+typedef enum CUshared_carveout_enum { /* non-negative values line up with the percent carveout described for ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT */
+    CU_SHAREDMEM_CARVEOUT_DEFAULT       = -1,  /**< No preference for shared memory or L1 cache (default) */
+    CU_SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, /**< Prefer maximum available shared memory, minimum L1 cache (carveout of 100) */
+    CU_SHAREDMEM_CARVEOUT_MAX_L1        = 0    /**< Prefer maximum available L1 cache, minimum shared memory (carveout of 0) */
+} CUshared_carveout;
+
+/**
+ * Memory types
+ */
+typedef enum CUmemorytype_enum { /* values start at 0x01; 0 is not assigned to any memory type here */
+    CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
+    CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
+    CU_MEMORYTYPE_ARRAY   = 0x03,    /**< Array memory */
+    CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
+} CUmemorytype;
+
+/**
+ * Compute Modes
+ */
+typedef enum CUcomputemode_enum { /* note: the value 1 is not assigned to any mode in this enum */
+    CU_COMPUTEMODE_DEFAULT           = 0, /**< Default compute mode (multiple contexts allowed per device) */
+    CU_COMPUTEMODE_PROHIBITED        = 2, /**< Compute-prohibited mode (no contexts can be created on this device at this time) */
+    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (only one context used by a single process can be present on this device at a time) */
+} CUcomputemode;
+
+/**
+ * Memory advise values
+ */
+typedef enum CUmem_advise_enum { /* values start at 1; each SET_* value is immediately followed by its UNSET_* counterpart */
+    CU_MEM_ADVISE_SET_READ_MOSTLY          = 1, /**< Data will mostly be read and only occasionally be written to */
+    CU_MEM_ADVISE_UNSET_READ_MOSTLY        = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
+    CU_MEM_ADVISE_SET_PREFERRED_LOCATION   = 3, /**< Set the preferred location for the data as the specified device */
+    CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */
+    CU_MEM_ADVISE_SET_ACCESSED_BY          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    CU_MEM_ADVISE_UNSET_ACCESSED_BY        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
+} CUmem_advise;
+
+typedef enum CUmem_range_attribute_enum { /* Attributes of a memory range; values start at 1 */
+    CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY                 = 1, /**< Whether the range will mostly be read and only occasionally be written to */
+    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION          = 2, /**< The preferred location of the range */
+    CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY                 = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for the specified device */
+    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION      = 4  /**< The last location to which the range was prefetched */
+    , CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE     = 5 /**< The preferred location type of the range */
+    , CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID       = 6 /**< The preferred location id of the range */
+    , CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE = 7 /**< The last location type to which the range was prefetched */
+    , CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID   = 8 /**< The last location id to which the range was prefetched */
+} CUmem_range_attribute;
+
+/**
+ * Online compiler and linker options
+ */
+typedef enum CUjit_option_enum
+{
+    /**
+     * Max number of registers that a thread may use.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_MAX_REGISTERS = 0,
+
+    /**
+     * IN: Specifies minimum number of threads per block to target compilation
+     * for\n
+     * OUT: Returns the number of threads the compiler actually targeted.
+     * This restricts the resource utilization of the compiler (e.g. max
+     * registers) such that a block with the given number of threads should be
+     * able to launch based on register limitations. Note, this option does not
+     * currently take into account any other resource limitations, such as
+     * shared memory utilization.\n
+     * Cannot be combined with ::CU_JIT_TARGET.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_THREADS_PER_BLOCK = 1,
+
+    /**
+     * Overwrites the option value with the total wall clock time, in
+     * milliseconds, spent in the compiler and linker\n
+     * Option type: float\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_WALL_TIME = 2,
+
+    /**
+     * Pointer to a buffer in which to print any log messages
+     * that are informational in nature (the buffer size is specified via
+     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER = 3,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
+
+    /**
+     * Pointer to a buffer in which to print any log messages that
+     * reflect errors (the buffer size is specified via option
+     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER = 5,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
+
+    /**
+     * Level of optimizations to apply to generated code (0 - 4), with 4
+     * being the default and highest level of optimizations.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_OPTIMIZATION_LEVEL = 7,
+
+    /**
+     * No option value required. Determines the target based on the current
+     * attached context (default)\n
+     * Option type: No option value needed\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET_FROM_CUCONTEXT = 8,
+
+    /**
+     * Target is chosen based on supplied ::CUjit_target.  Cannot be
+     * combined with ::CU_JIT_THREADS_PER_BLOCK.\n
+     * Option type: unsigned int for enumerated type ::CUjit_target\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET = 9,
+
+    /**
+     * Specifies choice of fallback strategy if matching cubin is not found.
+     * Choice is based on supplied ::CUjit_fallback.  This option cannot be
+     * used with cuLink* APIs as the linker requires exact matches.\n
+     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
+     * Applies to: compiler only
+     */
+    CU_JIT_FALLBACK_STRATEGY = 10,
+
+    /**
+     * Specifies whether to create debug information in output (-g)
+     * (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_GENERATE_DEBUG_INFO = 11,
+
+    /**
+     * Generate verbose log messages (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_LOG_VERBOSE = 12,
+
+    /**
+     * Generate line number information (-lineinfo) (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_GENERATE_LINE_INFO = 13,
+
+    /**
+     * Specifies whether to enable caching explicitly (-dlcm) \n
+     * Choice is based on supplied ::CUjit_cacheMode_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
+     * Applies to: compiler only
+     */
+    CU_JIT_CACHE_MODE = 14,
+
+    /**
+     * \deprecated
+     * This jit option is deprecated and should not be used.
+     */
+    CU_JIT_NEW_SM3X_OPT = 15,
+
+    /**
+     * This jit option is used for internal purposes only.
+     */
+    CU_JIT_FAST_COMPILE = 16,
+
+    /**
+     * Array of device symbol names that will be relocated to the corresponding
+     * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * When loading a device module, driver will relocate all encountered
+     * unresolved symbols to the host addresses.\n
+     * It is only allowed to register symbols that correspond to unresolved
+     * global variables.\n
+     * It is illegal to register the same device symbol at multiple addresses.\n
+     * Option type: const char **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_NAMES = 17,
+
+    /**
+     * Array of host addresses that will be used to relocate corresponding
+     * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * Option type: void **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_ADDRESSES = 18,
+
+    /**
+     * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
+     * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
+     * Option type: unsigned int\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_COUNT = 19,
+
+    /**
+     * \deprecated
+     * Enable link-time optimization (-dlto) for device code (Disabled by default).\n
+     * This option is not supported on 32-bit platforms.\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_LTO = 20,
+
+    /**
+     * \deprecated
+     * Control single-precision denormals (-ftz) support (0: false, default).\n
+     * 1 : flushes denormal values to zero\n
+     * 0 : preserves denormal values\n
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_FTZ = 21,
+
+    /**
+     * \deprecated
+     * Control single-precision floating-point division and reciprocals
+     * (-prec-div) support (1: true, default).\n
+     * 1 : Enables the IEEE round-to-nearest mode\n
+     * 0 : Enables the fast approximation mode\n
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_PREC_DIV = 22,
+
+    /**
+     * \deprecated
+     * Control single-precision floating-point square root
+     * (-prec-sqrt) support (1: true, default).\n
+     * 1 : Enables the IEEE round-to-nearest mode\n
+     * 0 : Enables the fast approximation mode\n
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_PREC_SQRT = 23,
+
+    /**
+     * \deprecated
+     * Enable/Disable the contraction of floating-point multiplies
+     * and adds/subtracts into floating-point multiply-add (-fma)
+     * operations (1: Enable, default; 0: Disable).\n
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_FMA = 24,
+
+    /**
+     * \deprecated
+     * Array of kernel names that should be preserved at link time while others
+     * can be removed.\n
+     * Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n
+     * Note that kernel names can be mangled by the compiler in which case the
+     * mangled name needs to be specified.\n
+     * Wildcard "*" can be used to represent zero or more characters instead of
+     * specifying the full or mangled name.\n
+     * It is important to note that the wildcard "*" is also added implicitly.
+     * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+     * thus preserve all kernels with those names. This can be avoided by providing
+     * a more specific name like "barfoobaz".\n
+     * Option type: const char **\n
+     * Applies to: dynamic linker only
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_KERNEL_NAMES = 25,
+
+    /**
+     * \deprecated
+     * Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n
+     * Option type: unsigned int\n
+     * Applies to: dynamic linker only
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_KERNEL_COUNT = 26,
+
+    /**
+     * \deprecated
+     * Array of variable names (__device__ and/or __constant__) that should be
+     * preserved at link time while others can be removed.\n
+     * Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n
+     * Note that variable names can be mangled by the compiler in which case the
+     * mangled name needs to be specified.\n
+     * Wildcard "*" can be used to represent zero or more characters instead of
+     * specifying the full or mangled name.\n
+     * It is important to note that the wildcard "*" is also added implicitly.
+     * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+     * thus preserve all variables with those names. This can be avoided by providing
+     * a more specific name like "barfoobaz".\n
+     * Option type: const char **\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_VARIABLE_NAMES = 27,
+
+    /**
+     * \deprecated
+     * Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n
+     * Option type: unsigned int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_REFERENCED_VARIABLE_COUNT = 28,
+
+    /**
+     * \deprecated
+     * This option serves as a hint to enable the JIT compiler/linker
+     * to remove constant (__constant__) and device (__device__) variables
+     * unreferenced in device code (Disabled by default).\n
+     * Note that host references to constant and device variables using APIs like
+     * ::cuModuleGetGlobal() with this option specified may result in undefined behavior unless
+     * the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES = 29,
+
+    /**
+     * Generate position independent code (0: false)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_POSITION_INDEPENDENT_CODE = 30,
+
+    /**
+     * This option hints to the JIT compiler the minimum number of CTAs from the
+     * kernel's grid to be mapped to a SM. This option is ignored when used together
+     * with ::CU_JIT_MAX_REGISTERS or ::CU_JIT_THREADS_PER_BLOCK.
+     * Optimizations based on this option need ::CU_JIT_MAX_THREADS_PER_BLOCK to
+     * be specified as well. For kernels already using PTX directive .minnctapersm,
+     * this option will be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES
+     * to let this option take precedence over the PTX directive.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_MIN_CTA_PER_SM = 31,
+
+    /**
+     * Maximum number of threads in a thread block, computed as the product of
+     * the maximum extent specified for each dimension of the block. This limit
+     * is guaranteed not to be exceeded in any invocation of the kernel. Exceeding
+     * the maximum number of threads results in runtime error or kernel launch
+     * failure. For kernels already using PTX directive .maxntid, this option will
+     * be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES to let this
+     * option take precedence over the PTX directive.\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_MAX_THREADS_PER_BLOCK = 32,
+
+    /**
+     * This option lets the values specified using ::CU_JIT_MAX_REGISTERS,
+     * ::CU_JIT_THREADS_PER_BLOCK, ::CU_JIT_MAX_THREADS_PER_BLOCK and
+     * ::CU_JIT_MIN_CTA_PER_SM take precedence over any PTX directives.
+     * (0: Disable, default; 1: Enable)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_OVERRIDE_DIRECTIVE_VALUES = 33,
+    CU_JIT_NUM_OPTIONS /**< Implicit value 34: one past the last option defined above */
+
+} CUjit_option;
+
+/*
+ * Indicates that a compute device class supports accelerated features: this base is added to a CU_TARGET_COMPUTE_<NN> value to form the matching CU_TARGET_COMPUTE_<NN>A target.
+ */
+#define CU_COMPUTE_ACCELERATED_TARGET_BASE   0x10000
+
+/**
+ * Online compilation targets
+ */
+typedef enum CUjit_target_enum
+{
+    CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
+    CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */
+    CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */
+    CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */
+    CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */
+    CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */
+    CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */
+    CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0 */
+    CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1 */
+    CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2 */
+    CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0 */
+    CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2 */
+    CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5 */
+    CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0 */
+    CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6 */
+    CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7 */
+    CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9 */
+    CU_TARGET_COMPUTE_90 = 90, /**< Compute device class 9.0 */
+    CU_TARGET_COMPUTE_100 = 100, /**< Compute device class 10.0 */
+    CU_TARGET_COMPUTE_101 = 101, /**< Compute device class 10.1 */
+    CU_TARGET_COMPUTE_120 = 120, /**< Compute device class 12.0 */
+
+    /** Compute device class 9.0 with accelerated features. */
+    CU_TARGET_COMPUTE_90A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_90,
+    /** Compute device class 10.0 with accelerated features. */
+    CU_TARGET_COMPUTE_100A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_100,
+    /** Compute device class 10.1 with accelerated features. */
+    CU_TARGET_COMPUTE_101A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_101,
+    /** Compute device class 12.0 with accelerated features. */
+    CU_TARGET_COMPUTE_120A = CU_COMPUTE_ACCELERATED_TARGET_BASE + CU_TARGET_COMPUTE_120 /* last enumerator: no trailing comma, so the list is also valid C89/C++03 */
+} CUjit_target;
+
+/**
+ * Cubin matching fallback strategies
+ */
+typedef enum CUjit_fallback_enum
+{
+    CU_PREFER_PTX = 0,  /**< Prefer to compile ptx if exact binary match not found */
+
+    CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code if exact match not found (implicit value 1) */
+
+} CUjit_fallback;
+
+/**
+ * Caching modes for dlcm
+ */
+typedef enum CUjit_cacheMode_enum
+{
+    CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */
+    CU_JIT_CACHE_OPTION_CG,       /**< Compile with L1 cache disabled (implicit value 1) */
+    CU_JIT_CACHE_OPTION_CA        /**< Compile with L1 cache enabled (implicit value 2) */
+} CUjit_cacheMode;
+
+/**
+ * Device code formats
+ */
+typedef enum CUjitInputType_enum
+{
+    /**
+     * Compiled device-class-specific device code\n
+     * Applicable options: none
+     */
+    CU_JIT_INPUT_CUBIN = 0,
+
+    /**
+     * PTX source code\n
+     * Applicable options: PTX compiler options
+     */
+    CU_JIT_INPUT_PTX = 1,
+
+    /**
+     * Bundle of multiple cubins and/or PTX of some device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_FATBINARY = 2,
+
+    /**
+     * Host object with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_OBJECT = 3,
+
+    /**
+     * Archive of host objects with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_LIBRARY = 4,
+
+    /**
+     * \deprecated
+     * High-level intermediate code for link-time optimization\n
+     * Applicable options: NVVM compiler options, PTX compiler options
+     *
+     * Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+     */
+    CU_JIT_INPUT_NVVM = 5,
+
+    CU_JIT_NUM_INPUT_TYPES = 6 /**< Count of the input types defined above (values 0 through 5) */
+} CUjitInputType;
+
+typedef struct CUlinkState_st *CUlinkState; /* pointer to struct CUlinkState_st, which is only forward-declared at this point in the header */
+
+/**
+ * Flags to register a graphics resource
+ */
+typedef enum CUgraphicsRegisterFlags_enum { /* the non-zero values are distinct single bits (0x01, 0x02, 0x04, 0x08) */
+    CU_GRAPHICS_REGISTER_FLAGS_NONE           = 0x00, /**< No flags (zero) */
+    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY      = 0x01,
+    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD  = 0x02,
+    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST   = 0x04,
+    CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08
+} CUgraphicsRegisterFlags;
+
+/**
+ * Flags for mapping and unmapping interop resources
+ */
+typedef enum CUgraphicsMapResourceFlags_enum { /* READ_ONLY / WRITE_DISCARD share their numeric values with the same-named CU_GRAPHICS_REGISTER_FLAGS_* entries */
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00, /**< No flags (zero) */
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
+} CUgraphicsMapResourceFlags;
+
+/**
+ * Array indices for cube faces
+ */
+typedef enum CUarray_cubemap_face_enum {
+    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap (index 0) */
+    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap (index 1) */
+    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap (index 2) */
+    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, /**< Negative Y face of cubemap (index 3) */
+    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, /**< Positive Z face of cubemap (index 4) */
+    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  /**< Negative Z face of cubemap (index 5) */
+} CUarray_cubemap_face;
+
+/**
+ * Limits
+ */
+typedef enum CUlimit_enum {
+    CU_LIMIT_STACK_SIZE                       = 0x00, /**< GPU thread stack size */
+    CU_LIMIT_PRINTF_FIFO_SIZE                 = 0x01, /**< GPU printf FIFO size */
+    CU_LIMIT_MALLOC_HEAP_SIZE                 = 0x02, /**< GPU malloc heap size */
+    CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 0x03, /**< GPU device runtime launch synchronize depth */
+    CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
+    CU_LIMIT_MAX_L2_FETCH_GRANULARITY         = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in bytes). This is a hint */
+    CU_LIMIT_PERSISTING_L2_CACHE_SIZE         = 0x06, /**< A size in bytes for L2 persisting lines cache size */
+    CU_LIMIT_SHMEM_SIZE                       = 0x07, /**< A maximum size in bytes of shared memory available to CUDA kernels on a CIG context. Can only be queried, cannot be set */
+    CU_LIMIT_CIG_ENABLED                      = 0x08, /**< A non-zero value indicates this CUDA context is a CIG-enabled context. Can only be queried, cannot be set */
+    CU_LIMIT_CIG_SHMEM_FALLBACK_ENABLED       = 0x09, /**< When set to zero, CUDA will fail to launch a kernel on a CIG context, instead of using the fallback path, if the kernel uses more shared memory than available */
+    CU_LIMIT_MAX /**< Implicit value 0x0A: one past the last limit defined above */
+} CUlimit;
+
+/**
+ * Resource types
+ */
+typedef enum CUresourcetype_enum { /* contiguous values 0x00 through 0x03 */
+    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resource */
+    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
+    CU_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
+} CUresourcetype;
+
+#ifdef _WIN32 /* when _WIN32 is defined, CUDA_CB expands to __stdcall */
+#define CUDA_CB __stdcall
+#else /* otherwise CUDA_CB expands to nothing */
+#define CUDA_CB
+#endif /* _WIN32 */
+
+/**
+ * CUDA host function: pointer to a function declared with ::CUDA_CB that takes one void* argument and returns nothing
+ * \param userData Argument value passed to the function
+ */
+typedef void (CUDA_CB *CUhostFn)(void *userData);
+
+/**
+ * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members.
+ */
+typedef enum CUaccessProperty_enum {
+    CU_ACCESS_PROPERTY_NORMAL           = 0,    /**< Normal cache persistence. */
+    CU_ACCESS_PROPERTY_STREAMING        = 1,    /**< Streaming access is less likely to persist in cache. */
+    CU_ACCESS_PROPERTY_PERSISTING       = 2     /**< Persisting access is more likely to persist in cache. */
+} CUaccessProperty;
+
+/**
+ * Specifies an access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE.
+ * Partition into many segments and assign segments such that:
+ * sum of "hit segments" / window == approx. ratio.
+ * sum of "miss segments" / window == approx 1-ratio.
+ * Segments and ratio specifications are fitted to the capabilities of
+ * the architecture.
+ * Accesses in a hit segment apply the hitProp access policy.
+ * Accesses in a miss segment apply the missProp access policy.
+ */
+typedef struct CUaccessPolicyWindow_st {
+    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
+    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
+    float hitRatio;                     /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */
+    CUaccessProperty hitProp;           /**< ::CUaccessProperty set for hit. */
+    CUaccessProperty missProp;          /**< ::CUaccessProperty set for miss. Must be either ::CU_ACCESS_PROPERTY_NORMAL or ::CU_ACCESS_PROPERTY_STREAMING */
+} CUaccessPolicyWindow_v1;
+/**
+ * Access policy window (unversioned alias of ::CUaccessPolicyWindow_v1)
+ */
+typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow;
+
+/**
+ * GPU kernel node parameters
+ */
+typedef struct CUDA_KERNEL_NODE_PARAMS_st {
+    CUfunction func;             /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+    void **extra;                /**< Extra options */
+} CUDA_KERNEL_NODE_PARAMS_v1; /* v1 layout: same leading members as ::CUDA_KERNEL_NODE_PARAMS_v2 but without its trailing kern and ctx members */
+
+/**
+ * GPU kernel node parameters (v2 layout; ::CUDA_KERNEL_NODE_PARAMS is an alias for this version)
+ */
+typedef struct CUDA_KERNEL_NODE_PARAMS_v2_st {
+    CUfunction func;             /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block, in bytes */
+    void **kernelParams;         /**< Array of pointers to the kernel parameters */
+    void **extra;                /**< Extra options */
+    CUkernel kern;               /**< Kernel to launch; only referenced if func is NULL */
+    CUcontext ctx;               /**< Context for the kernel task to run in. NULL indicates that the current context should be used by the API. This field is ignored if func is set. */
+} CUDA_KERNEL_NODE_PARAMS_v2;
+typedef CUDA_KERNEL_NODE_PARAMS_v2 CUDA_KERNEL_NODE_PARAMS;
+
+/**
+ * GPU kernel node parameters (v3 layout; declares the same fields as ::CUDA_KERNEL_NODE_PARAMS_v2)
+ */
+typedef struct CUDA_KERNEL_NODE_PARAMS_v3_st {
+    CUfunction func;             /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block, in bytes */
+    void **kernelParams;         /**< Array of pointers to the kernel parameters */
+    void **extra;                /**< Extra options */
+    CUkernel kern;               /**< Kernel to launch; only referenced if func is NULL */
+    CUcontext ctx;               /**< Context for the kernel task to run in. NULL indicates that the current context should be used by the API. This field is ignored if func is set. */
+} CUDA_KERNEL_NODE_PARAMS_v3;
+
+/**
+ * Memset node parameters (v1 layout; ::CUDA_MEMSET_NODE_PARAMS is an alias for this version)
+ */
+typedef struct CUDA_MEMSET_NODE_PARAMS_st {
+    CUdeviceptr dst;                        /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of the destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of each row, in elements */
+    size_t height;                          /**< Number of rows */
+} CUDA_MEMSET_NODE_PARAMS_v1;
+typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS;
+
+/**
+ * Memset node parameters (v2 layout; the ::CUDA_MEMSET_NODE_PARAMS_v1 fields followed by ctx)
+ */
+typedef struct CUDA_MEMSET_NODE_PARAMS_v2_st {
+    CUdeviceptr dst;                        /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of the destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of each row, in elements */
+    size_t height;                          /**< Number of rows */
+    CUcontext ctx;                          /**< Context on which to run the node */
+} CUDA_MEMSET_NODE_PARAMS_v2;
+
+/**
+ * Host node parameters (v1 layout; ::CUDA_HOST_NODE_PARAMS is an alias for this version)
+ */
+typedef struct CUDA_HOST_NODE_PARAMS_st {
+    CUhostFn fn;    /**< The host function to call when the node executes */
+    void* userData; /**< Argument passed to \c fn */
+} CUDA_HOST_NODE_PARAMS_v1;
+typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS;
+
+/**
+ * Host node parameters (v2 layout; declares the same fields as ::CUDA_HOST_NODE_PARAMS_v1)
+ */
+typedef struct CUDA_HOST_NODE_PARAMS_v2_st {
+    CUhostFn fn;    /**< The host function to call when the node executes */
+    void* userData; /**< Argument passed to \c fn */
+} CUDA_HOST_NODE_PARAMS_v2;
+
+/**
+ * Conditional node handle flags
+ */
+#define CU_GRAPH_COND_ASSIGN_DEFAULT   0x1 /**< The default value is applied when the graph is launched. */
+
+/**
+ * Conditional node types (see ::CUDA_CONDITIONAL_NODE_PARAMS::type)
+ */
+typedef enum CUgraphConditionalNodeType_enum {
+     CU_GRAPH_COND_TYPE_IF = 0,     /**< Conditional 'if/else' Node. Body[0] is executed if the condition is non-zero.  If ::CUDA_CONDITIONAL_NODE_PARAMS::size == 2, an optional ELSE graph is created and this is executed if the condition is zero. */
+     CU_GRAPH_COND_TYPE_WHILE = 1,  /**< Conditional 'while' Node. Body is executed repeatedly while the condition value is non-zero. */
+     CU_GRAPH_COND_TYPE_SWITCH = 2, /**< Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. */
+} CUgraphConditionalNodeType;
+
+/**
+ * Conditional node parameters
+ */
+typedef struct CUDA_CONDITIONAL_NODE_PARAMS {
+    CUgraphConditionalHandle handle;   /**< Conditional node handle.
+                                            Handles must be created in advance of creating the node
+                                            using ::cuGraphConditionalHandleCreate. */
+    CUgraphConditionalNodeType type;   /**< Type of conditional node. */
+    unsigned int size;                 /**< Size of the graph output array.  Allowed values are 1 for ::CU_GRAPH_COND_TYPE_WHILE, 1 or 2
+                                            for ::CU_GRAPH_COND_TYPE_IF, or any value greater than zero for ::CU_GRAPH_COND_TYPE_SWITCH. */
+    CUgraph *phGraph_out;              /**< CUDA-owned array populated with conditional node child graphs during creation of the node.
+                                            Valid for the lifetime of the conditional node.
+                                            The contents of the graph(s) are subject to the following constraints:
+
+                                            - Allowed node types are kernel nodes, empty nodes, child graphs, memsets,
+                                              memcopies, and conditionals. This applies recursively to child graphs and conditional bodies.
+                                            - All kernels, including kernels in nested conditionals or child graphs at any level,
+                                              must belong to the same CUDA context.
+
+                                            These graphs may be populated using graph node creation APIs or ::cuStreamBeginCaptureToGraph.
+
+                                            ::CU_GRAPH_COND_TYPE_IF:
+                                            phGraph_out[0] is executed when the condition is non-zero.  If \p size == 2, phGraph_out[1] will
+                                            be executed when the condition is zero.
+                                            ::CU_GRAPH_COND_TYPE_WHILE:
+                                            phGraph_out[0] is executed as long as the condition is non-zero.
+                                            ::CU_GRAPH_COND_TYPE_SWITCH:
+                                            phGraph_out[n] is executed when the condition is equal to n.  If the condition >= \p size,
+                                            no body graph is executed.
+                                         */
+    CUcontext ctx;                     /**< Context on which to run the node.  Must match the context used to create the handle and all body nodes. */
+} CUDA_CONDITIONAL_NODE_PARAMS;
+
+/**
+ * Graph node types
+ */
+typedef enum CUgraphNodeType_enum {
+    CU_GRAPH_NODE_TYPE_KERNEL           = 0, /**< GPU kernel node */
+    CU_GRAPH_NODE_TYPE_MEMCPY           = 1, /**< Memcpy node */
+    CU_GRAPH_NODE_TYPE_MEMSET           = 2, /**< Memset node */
+    CU_GRAPH_NODE_TYPE_HOST             = 3, /**< Host (executable) node */
+    CU_GRAPH_NODE_TYPE_GRAPH            = 4, /**< Node which executes an embedded graph */
+    CU_GRAPH_NODE_TYPE_EMPTY            = 5, /**< Empty (no-op) node */
+    CU_GRAPH_NODE_TYPE_WAIT_EVENT       = 6, /**< External event wait node */
+    CU_GRAPH_NODE_TYPE_EVENT_RECORD     = 7, /**< External event record node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT   = 9, /**< External semaphore wait node */
+    CU_GRAPH_NODE_TYPE_MEM_ALLOC        = 10,/**< Memory allocation node */
+    CU_GRAPH_NODE_TYPE_MEM_FREE         = 11,/**< Memory free node */
+    CU_GRAPH_NODE_TYPE_BATCH_MEM_OP     = 12,/**< Batch MemOp node */
+    CU_GRAPH_NODE_TYPE_CONDITIONAL      = 13 /**< Conditional node
+
+                                                  May be used to implement a conditional execution path or loop
+                                                  inside of a graph. The graph(s) contained within the body of the conditional node
+                                                  can be selectively executed or iterated upon based on the value of a conditional
+                                                  variable.
+
+                                                  Handles must be created in advance of creating the node
+                                                  using ::cuGraphConditionalHandleCreate.
+
+                                                  The following restrictions apply to graphs which contain conditional nodes:
+                                                   - The graph cannot be used in a child node.
+                                                   - Only one instantiation of the graph may exist at any point in time.
+                                                   - The graph cannot be cloned.
+
+                                                  To set the control value, supply a default value when creating the handle and/or
+                                                  call ::cudaGraphSetConditional from device code. */
+} CUgraphNodeType;
+
+/**
+ * Type annotations that can be applied to graph edges as part of ::CUgraphEdgeData.
+ */
+typedef enum CUgraphDependencyType_enum {
+    CU_GRAPH_DEPENDENCY_TYPE_DEFAULT = 0, /**< This is an ordinary dependency. */
+    CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC = 1  /**< This dependency type allows the downstream node to
+                                                    use \c cudaGridDependencySynchronize(). It may only be used
+                                                    between kernel nodes, and must be used with either the
+                                                    ::CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC or the
+                                                    ::CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER outgoing port. */
+} CUgraphDependencyType;
+
+/**
+ * This port activates when the kernel has finished executing (the zero-valued default output port).
+ */
+#define CU_GRAPH_KERNEL_NODE_PORT_DEFAULT 0
+/**
+ * This port activates when all blocks of the kernel have performed \c cudaTriggerProgrammaticLaunchCompletion()
+ * or have terminated. It must be used with edge type ::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC. See also
+ * ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT.
+ */
+#define CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC 1
+/**
+ * This port activates when all blocks of the kernel have begun execution. See also
+ * ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT.
+ */
+#define CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER 2
+
+/**
+ * Optional annotation for edges in a CUDA graph. Note that all edges implicitly have annotations and
+ * default to a zero-initialized value if not specified. A zero-initialized struct indicates a
+ * standard full serialization of two nodes with memory visibility.
+ */
+typedef struct CUgraphEdgeData_st {
+    unsigned char from_port; /**< This indicates when the dependency is triggered from the upstream
+                                  node on the edge. The meaning is specific to the node type. A value
+                                  of 0 in all cases means full completion of the upstream node, with
+                                  memory visibility to the downstream node or portion thereof
+                                  (indicated by \c to_port).
+                                  <br>
+                                  Only kernel nodes define non-zero ports. A kernel node
+                                  can use the following output port types:
+                                  ::CU_GRAPH_KERNEL_NODE_PORT_DEFAULT, ::CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC,
+                                  or ::CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER. */
+    unsigned char to_port; /**< This indicates what portion of the downstream node is dependent on
+                                the upstream node or portion thereof (indicated by \c from_port). The
+                                meaning is specific to the node type. A value of 0 in all cases means
+                                the entirety of the downstream node is dependent on the upstream work.
+                                <br>
+                                Currently no node types define non-zero ports. Accordingly, this field
+                                must be set to zero. */
+    unsigned char type; /**< This should be populated with a value from ::CUgraphDependencyType. (It
+                             is typed as char due to the compiler-specific layout of bitfields.) See
+                             ::CUgraphDependencyType. */
+    unsigned char reserved[5]; /**< These bytes are unused and must be zeroed. This ensures
+                                    compatibility if additional fields are added in the future. */
+} CUgraphEdgeData;
+
+/**
+ * Graph instantiation results (reported through ::CUDA_GRAPH_INSTANTIATE_PARAMS::result_out)
+ */
+typedef enum CUgraphInstantiateResult_enum
+{
+    CUDA_GRAPH_INSTANTIATE_SUCCESS = 0,                          /**< Instantiation succeeded */
+    CUDA_GRAPH_INSTANTIATE_ERROR = 1,                            /**< Instantiation failed for an unexpected reason, which is described in the return value of the function */
+    CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE = 2,                /**< Instantiation failed due to invalid structure, such as cycles */
+    CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED = 3,     /**< Instantiation for device launch failed because the graph contained an unsupported operation */
+    CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED = 4,      /**< Instantiation for device launch failed due to the nodes belonging to different contexts */
+    CUDA_GRAPH_INSTANTIATE_CONDITIONAL_HANDLE_UNUSED = 5,        /**< One or more conditional handles are not associated with conditional nodes */
+} CUgraphInstantiateResult;
+
+/**
+ * Graph instantiation parameters (the *_out members report the outcome of instantiation)
+ */
+typedef struct CUDA_GRAPH_INSTANTIATE_PARAMS_st
+{
+	cuuint64_t flags;                    /**< Instantiation flags */
+	CUstream hUploadStream;              /**< Upload stream */
+	CUgraphNode hErrNode_out;            /**< The node which caused instantiation to fail, if any */
+	CUgraphInstantiateResult result_out; /**< Whether instantiation was successful and, if it failed, the reason why */
+} CUDA_GRAPH_INSTANTIATE_PARAMS;
+
+typedef enum CUsynchronizationPolicy_enum { /* Synchronization policies; no enumerator has the value 0 (numbering starts at 1). */
+    CU_SYNC_POLICY_AUTO = 1,
+    CU_SYNC_POLICY_SPIN = 2,
+    CU_SYNC_POLICY_YIELD = 3,
+    CU_SYNC_POLICY_BLOCKING_SYNC = 4
+} CUsynchronizationPolicy; /* NOTE(review): presumably the type of ::CUlaunchAttributeValue::syncPolicy - confirm. */
+
+/**
+ * Cluster scheduling policies. These may be passed to ::cuFuncSetAttribute or ::cuKernelSetAttribute.
+ */
+typedef enum CUclusterSchedulingPolicy_enum {
+    CU_CLUSTER_SCHEDULING_POLICY_DEFAULT        = 0, /**< The default policy */
+    CU_CLUSTER_SCHEDULING_POLICY_SPREAD         = 1, /**< Spread the blocks within a cluster to the SMs */
+    CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING = 2  /**< Allow the hardware to load-balance the blocks in a cluster to the SMs */
+} CUclusterSchedulingPolicy;
+
+/**
+ * Memory Synchronization Domain
+ *
+ * A kernel can be launched in a specified memory synchronization domain that affects all memory operations issued by
+ * that kernel. A memory barrier issued in one domain will only order memory operations in that domain, thus eliminating
+ * the latency increase from memory barriers ordering unrelated traffic.
+ *
+ * By default, kernels are launched in domain 0. Kernels launched with ::CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a
+ * different domain ID. Users may also alter the domain ID with ::CUlaunchMemSyncDomainMap for a specific stream /
+ * graph node / kernel launch. See ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN, ::cuStreamSetAttribute, ::cuLaunchKernelEx,
+ * ::cuGraphKernelNodeSetAttribute.
+ *
+ * Memory operations done in kernels launched in different domains are considered system-scope distanced. In other
+ * words, a GPU-scoped memory synchronization is not sufficient for memory order to be observed by kernels in another
+ * memory synchronization domain, even if they are on the same GPU.
+ */
+typedef enum CUlaunchMemSyncDomain_enum {
+    CU_LAUNCH_MEM_SYNC_DOMAIN_DEFAULT = 0,    /**< Launch kernels in the default domain */
+    CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE  = 1     /**< Launch kernels in the remote domain */
+} CUlaunchMemSyncDomain;
+
+/**
+ * Memory Synchronization Domain map
+ *
+ * See ::CUlaunchMemSyncDomain.
+ *
+ * By default, kernels are launched in domain 0. Kernels launched with ::CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a
+ * different domain ID. Users may also alter the domain ID with ::CUlaunchMemSyncDomainMap for a specific stream /
+ * graph node / kernel launch. See ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
+ *
+ * The domain ID range is available through ::CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT.
+ */
+typedef struct CUlaunchMemSyncDomainMap_st {
+    unsigned char default_;     /**< The default domain ID to use for designated kernels */
+    unsigned char remote;       /**< The remote domain ID to use for designated kernels */
+} CUlaunchMemSyncDomainMap;
+
+/**
+ * Launch attributes enum; used as id field of ::CUlaunchAttribute
+ */
+typedef enum CUlaunchAttributeID_enum {
+    CU_LAUNCH_ATTRIBUTE_IGNORE = 0 /**< Ignored entry, for convenient composition */
+  , CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 1 /**< Valid for streams, graph nodes, launches. See
+                                                      ::CUlaunchAttributeValue::accessPolicyWindow. */
+  , CU_LAUNCH_ATTRIBUTE_COOPERATIVE            = 2 /**< Valid for graph nodes, launches. See
+                                                      ::CUlaunchAttributeValue::cooperative. */
+  , CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< Valid for streams. See
+                                                      ::CUlaunchAttributeValue::syncPolicy. */
+  , CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION                    = 4 /**< Valid for graph nodes, launches. See ::CUlaunchAttributeValue::clusterDim. */
+  , CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5 /**< Valid for graph nodes, launches. See ::CUlaunchAttributeValue::clusterSchedulingPolicyPreference. */
+  , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION    = 6 /**< Valid for launches. Setting
+                                                                  ::CUlaunchAttributeValue::programmaticStreamSerializationAllowed
+                                                                  to non-0 signals that the kernel will use programmatic
+                                                                  means to resolve its stream dependency, so that the
+                                                                  CUDA runtime should opportunistically allow the grid's
+                                                                  execution to overlap with the previous kernel in the
+                                                                  stream, if that kernel requests the overlap. The
+                                                                  dependent launches can choose to wait on the
+                                                                  dependency using the programmatic sync
+                                                                  (cudaGridDependencySynchronize() or equivalent PTX
+                                                                  instructions). */
+  , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT                   = 7 /**< Valid for launches. Set
+                                                                      ::CUlaunchAttributeValue::programmaticEvent to
+                                                                      record the event. Event recorded through this
+                                                                      launch attribute is guaranteed to only trigger
+                                                                      after all block in the associated kernel trigger
+                                                                      the event. A block can trigger the event through
+                                                                      PTX launchdep.release or CUDA builtin function
+                                                                      cudaTriggerProgrammaticLaunchCompletion(). A
+                                                                      trigger can also be inserted at the beginning of
+                                                                      each block's execution if triggerAtBlockStart is
+                                                                      set to non-0. The dependent launches can choose to
+                                                                      wait on the dependency using the programmatic sync
+                                                                      (cudaGridDependencySynchronize() or equivalent PTX
+                                                                      instructions). Note that dependents (including the
+                                                                      CPU thread calling cuEventSynchronize()) are not
+                                                                      guaranteed to observe the release precisely when
+                                                                      it is released.  For example, cuEventSynchronize()
+                                                                      may only observe the event trigger long after the
+                                                                      associated kernel has completed. This recording
+                                                                      type is primarily meant for establishing
+                                                                      programmatic dependency between device tasks. Note
+                                                                      also this type of dependency allows, but does not
+                                                                      guarantee, concurrent execution of tasks.
+                                                                      <br>
+                                                                      The event supplied must not be an interprocess or
+                                                                      interop event. The event must disable timing (i.e.
+                                                                      must be created with the ::CU_EVENT_DISABLE_TIMING
+                                                                      flag set).
+                                                                      */
+  , CU_LAUNCH_ATTRIBUTE_PRIORITY               = 8 /**< Valid for streams, graph nodes, launches. See
+                                                        ::CUlaunchAttributeValue::priority. */
+  , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP    = 9 /**< Valid for streams, graph nodes, launches. See
+                                                      ::CUlaunchAttributeValue::memSyncDomainMap. */
+  , CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN        = 10 /**< Valid for streams, graph nodes, launches. See
+                                                       ::CUlaunchAttributeValue::memSyncDomain. */
+  , CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11 /**< Valid for graph nodes, launches. Set
+                                                              ::CUlaunchAttributeValue::preferredClusterDim
+                                                              to allow the kernel launch to specify a preferred substitute
+                                                              cluster dimension. Blocks may be grouped according to either
+                                                              the dimensions specified with this attribute (grouped into a
+                                                              "preferred substitute cluster"), or the one specified with
+                                                              ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped
+                                                              into a "regular cluster"). The cluster dimensions of a
+                                                              "preferred substitute cluster" shall be an integer multiple
+                                                              greater than zero of the regular cluster dimensions. The
+                                                              device will attempt - on a best-effort basis - to group
+                                                              thread blocks into preferred clusters over grouping them
+                                                              into regular clusters. When it deems necessary (primarily
+                                                              when the device temporarily runs out of physical resources
+                                                              to launch the larger preferred clusters), the device may
+                                                              switch to launch the regular clusters instead to attempt to
+                                                              utilize as much of the physical device resources as possible.
+                                                              <br>
+                                                              Each type of cluster will have its enumeration / coordinate
+                                                              setup as if the grid consists solely of its type of cluster.
+                                                              For example, if the preferred substitute cluster dimensions
+                                                              double the regular cluster dimensions, there might be
+                                                              simultaneously a regular cluster indexed at (1,0,0), and a
+                                                              preferred cluster indexed at (1,0,0). In this example, the
+                                                              preferred substitute cluster (1,0,0) replaces regular
+                                                              clusters (2,0,0) and (3,0,0) and groups their blocks.
+                                                              <br>
+                                                              This attribute will only take effect when a regular cluster
+                                                              dimension has been specified. The preferred substitute
+                                                              cluster dimension must be an integer multiple greater than
+                                                              zero of the regular cluster dimension and must divide the
+                                                              grid. It must also be no more than `maxBlocksPerCluster`, if
+                                                              it is set in the kernel's `__launch_bounds__`. Otherwise it
+                                                              must be less than the maximum value the driver can support.
+                                                              Otherwise, setting this attribute to a value physically
+                                                              unable to fit on any particular device is permitted. */
+  , CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12 /**< Valid for launches. Set
+                                                          ::CUlaunchAttributeValue::launchCompletionEvent to record the
+                                                          event.
+                                                          <br>
+                                                          Nominally, the event is triggered once all blocks of the kernel
+                                                          have begun execution. Currently this is a best effort. If a kernel
+                                                          B has a launch completion dependency on a kernel A, B may wait
+                                                          until A is complete. Alternatively, blocks of B may begin before
+                                                          all blocks of A have begun, for example if B can claim execution
+                                                          resources unavailable to A (e.g. they run on different GPUs) or
+                                                          if B is a higher priority than A.
+                                                          Exercise caution if such an ordering inversion could lead
+                                                          to deadlock.
+                                                          <br>
+                                                          A launch completion event is nominally similar to a programmatic
+                                                          event with \c triggerAtBlockStart set except that it is not
+                                                          visible to \c cudaGridDependencySynchronize() and can be used with
+                                                          compute capability less than 9.0.
+                                                          <br>
+                                                          The event supplied must not be an interprocess or interop
+                                                          event. The event must disable timing (i.e. must be created
+                                                          with the ::CU_EVENT_DISABLE_TIMING flag set). */
+  , CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13 /**< Valid for graph nodes, launches. This attribute is graphs-only,
+                                                               and passing it to a launch in a non-capturing stream will result
+                                                               in an error.
+                                                               <br>
+                                                               ::CUlaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can 
+                                                               only be set to 0 or 1. Setting the field to 1 indicates that the
+                                                               corresponding kernel node should be device-updatable. On success, a handle
+                                                               will be returned via
+                                                               ::CUlaunchAttributeValue::deviceUpdatableKernelNode::devNode which can be
+                                                               passed to the various device-side update functions to update the node's
+                                                               kernel parameters from within another kernel. For more information on the
+                                                               types of device updates that can be made, as well as the relevant limitations
+                                                               thereof, see ::cudaGraphKernelNodeUpdatesApply.
+                                                               <br>
+                                                               Nodes which are device-updatable have additional restrictions compared to
+                                                               regular kernel nodes. Firstly, device-updatable nodes cannot be removed
+                                                               from their graph via ::cuGraphDestroyNode. Additionally, once opted-in
+                                                               to this functionality, a node cannot opt out, and any attempt to set the
+                                                               deviceUpdatable attribute to 0 will result in an error. Device-updatable
+                                                               kernel nodes also cannot have their attributes copied to/from another kernel
+                                                               node via ::cuGraphKernelNodeCopyAttributes. Graphs containing one or more
+                                                               device-updatable nodes also do not allow multiple instantiation, and neither
+                                                               the graph nor its instantiated version can be passed to ::cuGraphExecUpdate.
+                                                               <br>
+                                                               If a graph contains device-updatable nodes and updates those nodes from the device
+                                                               from within the graph, the graph must be uploaded with ::cuGraphUpload before it
+                                                               is launched. For such a graph, if host-side executable graph updates are made to the
+                                                               device-updatable nodes, the graph must be uploaded before it is launched again. */
+  , CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 14 /**< Valid for launches. On devices where the L1 cache and shared memory use the
+                                                                   same hardware resources, setting ::CUlaunchAttributeValue::sharedMemCarveout to a 
+                                                                   percentage between 0-100 signals the CUDA driver to set the shared memory carveout 
+                                                                   preference, in percent of the total shared memory for that kernel launch. 
+                                                                   This attribute takes precedence over ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+                                                                   This is only a hint, and the CUDA driver can choose a different configuration if
+                                                                   required for the launch. */
+#if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
+  , CU_LAUNCH_ATTRIBUTE_MAX
+#endif
+} CUlaunchAttributeID;
+
+/**
+ * Launch attributes union; used as value field of ::CUlaunchAttribute.
+ * Apart from \p pad, each member carries the value for the launch attribute
+ * named in that member's own comment.
+ */
+typedef union CUlaunchAttributeValue_union {
+    char pad[64]; /* Pad to 64 bytes: keeps the union at least 64 bytes large whatever its other members are */
+    CUaccessPolicyWindow accessPolicyWindow; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW. */
+    int cooperative; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero indicates a cooperative
+                        kernel (see ::cuLaunchCooperativeKernel). */
+    CUsynchronizationPolicy syncPolicy; /**< Value of launch attribute
+                                           ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. ::CUsynchronizationPolicy for
+                                           work queued up in this stream. */
+
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION that
+     *  represents the desired cluster dimensions for the kernel. Opaque type
+     *  with the following fields:
+     *      - \p x - The X dimension of the cluster, in blocks. Must be a divisor
+     *               of the grid X dimension.
+     *      - \p y - The Y dimension of the cluster, in blocks. Must be a divisor
+     *               of the grid Y dimension.
+     *      - \p z - The Z dimension of the cluster, in blocks. Must be a divisor
+     *               of the grid Z dimension.
+     */
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } clusterDim;
+    CUclusterSchedulingPolicy clusterSchedulingPolicyPreference; /**< Value of launch attribute
+                                                                    ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
+                                                                    scheduling policy preference for the kernel. */
+    int programmaticStreamSerializationAllowed;  /**< Value of launch attribute
+                                                   ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION. */
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
+     *  with the following fields:
+     *      - \p CUevent event - Event to fire when all blocks trigger it.
+     *      - \p int flags - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.
+     *      - \p int triggerAtBlockStart - If this is set to non-0, each block launch will automatically trigger the event.
+     */
+    struct {
+        CUevent event;
+        int flags;
+        int triggerAtBlockStart;
+    } programmaticEvent;
+    /**
+     * Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT
+     * with the following fields:
+     *     - \p CUevent event - Event to fire when the last block launches.
+     *     - \p int flags - Event record flags, see ::cuEventRecordWithFlags. Does not accept ::CU_EVENT_RECORD_EXTERNAL.
+     */
+    struct {
+        CUevent event;
+        int flags;
+    } launchCompletionEvent;
+    int priority; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution priority of the kernel. */
+    CUlaunchMemSyncDomainMap memSyncDomainMap; /**< Value of launch attribute
+                                                  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP. See
+                                                  ::CUlaunchMemSyncDomainMap. */
+    CUlaunchMemSyncDomain memSyncDomain;       /**< Value of launch attribute
+                                                  ::CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN. See ::CUlaunchMemSyncDomain. */
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
+     *  that represents the desired preferred cluster dimensions for the kernel.
+     *  Opaque type with the following fields:
+     *      - \p x - The X dimension of the preferred cluster, in blocks. Must
+     *               be a divisor of the grid X dimension, and must be a
+     *               multiple of the \p x field of ::CUlaunchAttributeValue::clusterDim.
+     *      - \p y - The Y dimension of the preferred cluster, in blocks. Must
+     *               be a divisor of the grid Y dimension, and must be a
+     *               multiple of the \p y field of ::CUlaunchAttributeValue::clusterDim.
+     *      - \p z - The Z dimension of the preferred cluster, in blocks. Must be
+     *               equal to the \p z field of ::CUlaunchAttributeValue::clusterDim.
+     */
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } preferredClusterDim;
+
+    /**
+     *  Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
+     *  with the following fields:
+     *      - \p int deviceUpdatable - Whether or not the resulting kernel node should be device-updatable.
+     *      - \p CUgraphDeviceNode devNode - Returns a handle to pass to the various device-side update functions.
+     */
+    struct {
+        int deviceUpdatable;
+        CUgraphDeviceNode devNode;
+    } deviceUpdatableKernelNode;
+    unsigned int sharedMemCarveout; /**< Value of launch attribute ::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. */
+} CUlaunchAttributeValue;
+
+/**
+ * Launch attribute: an attribute ID paired with the value to apply for it.
+ */
+typedef struct CUlaunchAttribute_st {
+    CUlaunchAttributeID id; /**< Attribute to set */
+    char pad[8 - sizeof(CUlaunchAttributeID)]; /* Explicit padding: \p id plus \p pad together occupy 8 bytes */
+    CUlaunchAttributeValue value; /**< Value of the attribute */
+} CUlaunchAttribute;
+
+/**
+ * CUDA extensible launch configuration.
+ * Bundles the grid and block dimensions, the dynamic shared-memory size, the
+ * stream, and an optional array of launch attributes (::CUlaunchAttribute).
+ */
+typedef struct CUlaunchConfig_st {
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    CUstream hStream;            /**< Stream identifier */
+    CUlaunchAttribute *attrs;    /**< List of attributes; nullable if ::CUlaunchConfig::numAttrs == 0 */
+    unsigned int numAttrs;       /**< Number of attributes populated in ::CUlaunchConfig::attrs */
+} CUlaunchConfig;
+
+/**
+ * Kernel node attribute IDs are the launch attribute IDs under another name:
+ * ::CUkernelNodeAttrID is a typedef of ::CUlaunchAttributeID, and each
+ * CU_KERNEL_NODE_ATTRIBUTE_* macro expands to the CU_LAUNCH_ATTRIBUTE_*
+ * enumerator with the same suffix.
+ */
+typedef CUlaunchAttributeID CUkernelNodeAttrID;
+#define CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+#define CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE          CU_LAUNCH_ATTRIBUTE_COOPERATIVE
+#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_DIMENSION                    CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
+#define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY             CU_LAUNCH_ATTRIBUTE_PRIORITY
+#define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP  CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP
+#define CU_KERNEL_NODE_ATTRIBUTE_MEM_SYNC_DOMAIN      CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN
+#define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
+#define CU_KERNEL_NODE_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
+#define CU_KERNEL_NODE_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
+
+/* Kernel node attribute values reuse the launch attribute value union (versioned alias, then the unversioned name). */
+typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1;
+typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
+
+/**
+ * Possible stream capture statuses returned by ::cuStreamIsCapturing.
+ */
+typedef enum CUstreamCaptureStatus_enum {
+    CU_STREAM_CAPTURE_STATUS_NONE        = 0, /**< Stream is not capturing. */
+    CU_STREAM_CAPTURE_STATUS_ACTIVE      = 1, /**< Stream is actively capturing. */
+    CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2  /**< Stream is part of a capture sequence that
+                                                   has been invalidated, but not terminated. */
+} CUstreamCaptureStatus;
+
+/**
+ * Possible modes for stream capture thread interactions. For more details see
+ * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode.
+ * The exact semantics of each mode are not described in this header; consult
+ * the documentation of the two functions above.
+ */
+typedef enum CUstreamCaptureMode_enum {
+    CU_STREAM_CAPTURE_MODE_GLOBAL       = 0, /**< "Global" capture mode. */
+    CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, /**< "Thread-local" capture mode. */
+    CU_STREAM_CAPTURE_MODE_RELAXED      = 2  /**< "Relaxed" capture mode. */
+} CUstreamCaptureMode;
+
+/**
+ * Stream attribute IDs are the launch attribute IDs under another name:
+ * ::CUstreamAttrID is a typedef of ::CUlaunchAttributeID, and each
+ * CU_STREAM_ATTRIBUTE_* macro expands to the CU_LAUNCH_ATTRIBUTE_*
+ * enumerator with the same suffix.
+ */
+typedef CUlaunchAttributeID CUstreamAttrID;
+#define CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW   CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+#define CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY
+#define CU_STREAM_ATTRIBUTE_PRIORITY               CU_LAUNCH_ATTRIBUTE_PRIORITY
+#define CU_STREAM_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP    CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP
+#define CU_STREAM_ATTRIBUTE_MEM_SYNC_DOMAIN        CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN
+
+/* Stream attribute values reuse the launch attribute value union (versioned alias, then the unversioned name). */
+typedef CUlaunchAttributeValue CUstreamAttrValue_v1;
+typedef CUstreamAttrValue_v1 CUstreamAttrValue;
+
+/**
+ * Flags to specify search options. For more details see ::cuGetProcAddress.
+ * The non-default values occupy distinct bits (1 << 0 and 1 << 1).
+ */
+typedef enum CUdriverProcAddress_flags_enum {
+    CU_GET_PROC_ADDRESS_DEFAULT = 0,                        /**< Default search mode for driver symbols. */
+    CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0,             /**< Search for legacy versions of driver symbols. */
+    CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1  /**< Search for per-thread versions of driver symbols. */
+} CUdriverProcAddress_flags;
+
+/**
+ * Flags to indicate search status. For more details see ::cuGetProcAddress.
+ */
+typedef enum CUdriverProcAddressQueryResult_enum {
+    CU_GET_PROC_ADDRESS_SUCCESS                = 0,  /**< Symbol was successfully found. */
+    CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND       = 1,  /**< Symbol was not found in search. */
+    CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT = 2   /**< Symbol was found but version supplied was not sufficient. */
+}  CUdriverProcAddressQueryResult;
+
+/**
+ * Execution Affinity Types
+ */
+typedef enum CUexecAffinityType_enum {
+    CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0,  /**< Create a context with limited SMs. */
+    CU_EXEC_AFFINITY_TYPE_MAX            /**< Implicitly 1, i.e. one past the last type above.
+                                              NOTE(review): presumably an end-of-list marker rather than a usable type - confirm. */
+} CUexecAffinityType;
+
+/**
+ * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT
+ */
+typedef struct CUexecAffinitySmCount_st {
+    unsigned int val;    /**< The number of SMs the context is limited to use. */
+} CUexecAffinitySmCount_v1;
+/* Unversioned name for ::CUexecAffinitySmCount_v1. */
+typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount;
+
+/**
+ * Execution Affinity Parameters: an affinity type together with the value for that type.
+ */
+typedef struct CUexecAffinityParam_st {
+    CUexecAffinityType type;              /**< Execution affinity type this parameter describes. */
+    union {
+        CUexecAffinitySmCount smCount;    /**< Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */
+    } param;                              /**< Value for the affinity type; currently has a single member. */
+} CUexecAffinityParam_v1;
+/**
+ * Execution Affinity Parameters (unversioned name for ::CUexecAffinityParam_v1)
+ */
+typedef CUexecAffinityParam_v1 CUexecAffinityParam;
+
+/**
+ * CIG shared data types; used as the type of ::CUctxCigParam::sharedDataType.
+ */
+typedef enum CUcigDataType_enum {
+    CIG_DATA_TYPE_D3D12_COMMAND_QUEUE = 0x1,    /**< D3D12 Command Queue Handle */
+} CUcigDataType;
+
+/**
+* CIG Context Create Params
+*/
+typedef struct CUctxCigParam_st {
+    CUcigDataType sharedDataType; /**< Kind of shared data; presumably describes what \p sharedData points to - TODO confirm. */
+    void* sharedData;             /**< Untyped pointer to the shared data. */
+} CUctxCigParam;
+
+/**
+* Params for creating CUDA context.
+* Exactly one of execAffinityParams and cigParams
+* must be non-NULL.
+*/
+typedef struct CUctxCreateParams_st {
+    CUexecAffinityParam *execAffinityParams;    /**< Execution affinity parameters, or NULL when \p cigParams is used. */
+    int                  numExecAffinityParams; /**< Presumably the number of entries in \p execAffinityParams - TODO confirm. */
+    CUctxCigParam       *cigParams;             /**< CIG parameters, or NULL when \p execAffinityParams is used. */
+} CUctxCreateParams;
+
+/**
+ * Library options to be specified with ::cuLibraryLoadData() or ::cuLibraryLoadFromFile()
+ */
+typedef enum CUlibraryOption_enum
+{
+    /* NOTE(review): not documented in this header; by its name it presumably pairs with
+     * ::CUlibraryHostUniversalFunctionAndDataTable - confirm. */
+    CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE = 0,
+
+    /**
+     * Specifies that the argument \p code passed to ::cuLibraryLoadData() will be preserved.
+     * Specifying this option will let the driver know that \p code can be accessed at any point
+     * until ::cuLibraryUnload(). The default behavior is for the driver to allocate and
+     * maintain its own copy of \p code. Note that this is only a memory usage optimization
+     * hint and the driver can choose to ignore it if required.
+     * Specifying this option with ::cuLibraryLoadFromFile() is invalid and
+     * will return ::CUDA_ERROR_INVALID_VALUE.
+     */
+    CU_LIBRARY_BINARY_IS_PRESERVED = 1,
+
+    /* Implicitly 2, i.e. one past the last option above; by its name, the number of options. */
+    CU_LIBRARY_NUM_OPTIONS
+} CUlibraryOption;
+
+/**
+ * Two pointer/size pairs: a function table and a data table.
+ * NOTE(review): undocumented in this header; presumably the value supplied for
+ * ::CU_LIBRARY_HOST_UNIVERSAL_FUNCTION_AND_DATA_TABLE - confirm. Units of the
+ * "window size" fields are not stated here.
+ */
+typedef struct CUlibraryHostUniversalFunctionAndDataTable_st
+{
+    void *functionTable;       /**< Pointer to the function table. */
+    size_t functionWindowSize; /**< Window size associated with \p functionTable. */
+    void *dataTable;           /**< Pointer to the data table. */
+    size_t dataWindowSize;     /**< Window size associated with \p dataTable. */
+} CUlibraryHostUniversalFunctionAndDataTable;
+
+/**
+ * Error codes
+ */
+typedef enum cudaError_enum {
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * also means that the operation being queried is complete (see
+     * ::cuEventQuery() and ::cuStreamQuery()).
+     */
+    CUDA_SUCCESS                              = 0,
+
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    CUDA_ERROR_INVALID_VALUE                  = 1,
+
+    /**
+     * The API call failed because it was unable to allocate enough memory or
+     * other resources to perform the requested operation.
+     */
+    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
+
+    /**
+     * This indicates that the CUDA driver has not been initialized with
+     * ::cuInit() or that initialization has failed.
+     */
+    CUDA_ERROR_NOT_INITIALIZED                = 3,
+
+    /**
+     * This indicates that the CUDA driver is in the process of shutting down.
+     */
+    CUDA_ERROR_DEINITIALIZED                  = 4,
+
+    /**
+     * This indicates profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like visual profiler.
+     */
+    CUDA_ERROR_PROFILER_DISABLED              = 5,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cuProfilerStart or
+     * ::cuProfilerStop without initialization.
+     */
+    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStart() when profiling is already enabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStop() when profiling is already disabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
+
+    /**
+     * This indicates that the CUDA driver that the application has loaded is a
+     * stub library. Applications that run with the stub rather than a real
+     * driver loaded will result in CUDA API returning this error.
+     */
+    CUDA_ERROR_STUB_LIBRARY                   = 34,
+
+    /**  
+     * This indicates that requested CUDA device is unavailable at the current
+     * time. Devices are often unavailable due to use of
+     * ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
+     */
+    CUDA_ERROR_DEVICE_UNAVAILABLE            = 46,
+
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    CUDA_ERROR_NO_DEVICE                      = 100,
+
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device or that the action requested is
+     * invalid for the specified device.
+     */
+    CUDA_ERROR_INVALID_DEVICE                 = 101,
+
+    /**
+     * This error indicates that the Grid license is not applied.
+     */
+    CUDA_ERROR_DEVICE_NOT_LICENSED            = 102,
+
+    /**
+     * This indicates that the device kernel image is invalid. This can also
+     * indicate an invalid CUDA module.
+     */
+    CUDA_ERROR_INVALID_IMAGE                  = 200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     * This can also be returned if the green context passed to an API call
+     * was not converted to a ::CUcontext using ::cuCtxFromGreenCtx API.
+     */
+    CUDA_ERROR_INVALID_CONTEXT                = 201,
+
+    /**
+     * This indicates that the context being supplied as a parameter to the
+     * API call was already the active context.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.2. It is no longer an
+     * error to attempt to push the active context via ::cuCtxPushCurrent().
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
+
+    /**
+     * This indicates that a map or register operation has failed.
+     */
+    CUDA_ERROR_MAP_FAILED                     = 205,
+
+    /**
+     * This indicates that an unmap or unregister operation has failed.
+     */
+    CUDA_ERROR_UNMAP_FAILED                   = 206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    CUDA_ERROR_ALREADY_MAPPED                 = 208,
+
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    CUDA_ERROR_NOT_MAPPED                     = 211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
+
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
+
+    /**
+     * This indicates that the ::CUlimit passed to the API call is not
+     * supported by the active device.
+     */
+    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
+
+    /**
+     * This indicates that the ::CUcontext passed to the API call can
+     * only be bound to a single CPU thread at a time but is already
+     * bound to a CPU thread.
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
+
+    /**
+     * This indicates that peer access is not supported across the given
+     * devices.
+     */
+    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217,
+
+    /**
+     * This indicates that a PTX JIT compilation failed.
+     */
+    CUDA_ERROR_INVALID_PTX                    = 218,
+
+    /**
+     * This indicates an error with OpenGL or DirectX context.
+     */
+    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219,
+
+    /**
+    * This indicates that an uncorrectable NVLink error was detected during the
+    * execution.
+    */
+    CUDA_ERROR_NVLINK_UNCORRECTABLE           = 220,
+
+    /**
+    * This indicates that the PTX JIT compiler library was not found.
+    */
+    CUDA_ERROR_JIT_COMPILER_NOT_FOUND         = 221,
+
+    /**
+     * This indicates that the provided PTX was compiled with an unsupported toolchain.
+     */
+
+    CUDA_ERROR_UNSUPPORTED_PTX_VERSION        = 222,
+
+    /**
+     * This indicates that the PTX JIT compilation was disabled.
+     */
+    CUDA_ERROR_JIT_COMPILATION_DISABLED       = 223,
+
+    /**
+     * This indicates that the ::CUexecAffinityType passed to the API call is not
+     * supported by the active device.
+     */ 
+    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY      = 224,
+
+    /**
+     * This indicates that the code to be compiled by the PTX JIT contains
+     * unsupported call to cudaDeviceSynchronize.
+     */
+    CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC       = 225,
+
+    /**
+     * This indicates that an exception occurred on the device that is now
+     * contained by the GPU's error containment capability. Common causes are -
+     * a. Certain types of invalid accesses of peer GPU memory over nvlink
+     * b. Certain classes of hardware errors
+     * This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must
+     * be terminated and relaunched.
+     */
+    CUDA_ERROR_CONTAINED                      = 226,
+
+    /**
+     * This indicates that the device kernel source is invalid. This includes
+     * compilation/linker errors encountered in device code or user error.
+     */
+    CUDA_ERROR_INVALID_SOURCE                 = 300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
+
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
+
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
+
+    /**
+     * This indicates that an OS call failed.
+     */
+    CUDA_ERROR_OPERATING_SYSTEM               = 304,
+
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
+     */
+    CUDA_ERROR_INVALID_HANDLE                 = 400,
+
+    /**
+     * This indicates that a resource required by the API call is not in a
+     * valid state to perform the requested operation.
+     */
+    CUDA_ERROR_ILLEGAL_STATE                  = 401,
+
+    /**
+     * This indicates an attempt was made to introspect an object in a way that
+     * would discard semantically important information. This is either due to
+     * the object using functionality newer than the API version used to
+     * introspect it or omission of optional return arguments.
+     */
+    CUDA_ERROR_LOSSY_QUERY                    = 402,
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, driver function names, texture names,
+     * and surface names.
+     */
+    CUDA_ERROR_NOT_FOUND                      = 500,
+
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
+     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
+     */
+    CUDA_ERROR_NOT_READY                      = 600,
+
+    /**
+     * While executing a kernel, the device encountered a
+     * load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_ADDRESS                = 700,
+
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. This error usually indicates that the user has
+     * attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register
+     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
+     * when a 32-bit int is expected) is equivalent to passing too many
+     * arguments and can also result in this error.
+     */
+    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
+
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
+
+    /**
+     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
+     * trying to re-enable peer access to a context which has already
+     * had peer access to it enabled.
+     */
+    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,
+
+    /**
+     * This error indicates that ::cuCtxDisablePeerAccess() is
+     * trying to disable peer access which has not been enabled yet
+     * via ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,
+
+    /**
+     * This error indicates that the primary context for the specified device
+     * has already been initialized.
+     */
+    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
+
+    /**
+     * A device-side assert triggered during kernel execution. The context
+     * cannot be used anymore, and must be destroyed. All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    CUDA_ERROR_ASSERT                         = 710,
+
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices
+     * passed to ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_TOO_MANY_PEERS                 = 711,
+
+    /**
+     * This error indicates that the memory range passed to ::cuMemHostRegister()
+     * has already been registered.
+     */
+    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
+
+    /**
+     * This error indicates that the pointer passed to ::cuMemHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713,
+
+    /**
+     * While executing a kernel, the device encountered a stack error.
+     * This can be due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_HARDWARE_STACK_ERROR           = 714,
+
+    /**
+     * While executing a kernel, the device encountered an illegal instruction.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715,
+
+    /**
+     * While executing a kernel, the device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_MISALIGNED_ADDRESS             = 716,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717,
+
+    /**
+     * While executing a kernel, the device program counter wrapped its address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_PC                     = 718,
+
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_FAILED                  = 719,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+     */
+    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE   = 720,
+
+    /**
+     * An exception occurred on the device while exiting a kernel using tensor memory: the
+     * tensor memory was not completely deallocated. This leaves the process in an inconsistent
+     * state and any further CUDA work will return the same error. To continue using CUDA, the
+     * process must be terminated and relaunched.
+     */
+    CUDA_ERROR_TENSOR_MEMORY_LEAK             = 721,
+
+    /**
+     * This error indicates that the attempted operation is not permitted.
+     */
+    CUDA_ERROR_NOT_PERMITTED                  = 800,
+
+    /**
+     * This error indicates that the attempted operation is not supported
+     * on the current system or device.
+     */
+    CUDA_ERROR_NOT_SUPPORTED                  = 801,
+
+    /**
+     * This error indicates that the system is not yet ready to start any CUDA
+     * work.  To continue using CUDA, verify the system configuration is in a
+     * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
+     */
+    CUDA_ERROR_SYSTEM_NOT_READY               = 802,
+
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH         = 803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
+
+    /**
+     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+     */
+    CUDA_ERROR_MPS_CONNECTION_FAILED          = 805,
+
+    /**
+     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+     */
+    CUDA_ERROR_MPS_RPC_FAILURE                = 806,
+
+    /**
+     * This error indicates that the MPS server is not ready to accept new MPS client requests.
+     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+     */
+    CUDA_ERROR_MPS_SERVER_NOT_READY           = 807,
+
+    /**
+     * This error indicates that the hardware resources required to create MPS client have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED        = 808,
+
+    /**
+     * This error indicates that the hardware resources required to support device connections have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED    = 809,
+
+    /**
+     * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+     */
+    CUDA_ERROR_MPS_CLIENT_TERMINATED          = 810,
+
+    /**
+     * This error indicates that the module is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it.
+     */
+    CUDA_ERROR_CDP_NOT_SUPPORTED              = 811,
+
+    /**
+     * This error indicates that a module contains an unsupported interaction between different versions of CUDA Dynamic Parallelism.
+     */
+    CUDA_ERROR_CDP_VERSION_MISMATCH           = 812,
+
+    /**
+     * This error indicates that the operation is not permitted when
+     * the stream is capturing.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED     = 900,
+
+    /**
+     * This error indicates that the current capture sequence on the stream
+     * has been invalidated due to a previous error.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED     = 901,
+
+    /**
+     * This error indicates that the operation would have resulted in a merge
+     * of two independent capture sequences.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_MERGE           = 902,
+
+    /**
+     * This error indicates that the capture was not initiated in this stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED       = 903,
+
+    /**
+     * This error indicates that the capture sequence contains a fork that was
+     * not joined to the primary stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNJOINED        = 904,
+
+    /**
+     * This error indicates that a dependency would have been created which
+     * crosses the capture sequence boundary. Only implicit in-stream ordering
+     * dependencies are allowed to cross the boundary.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_ISOLATION       = 905,
+
+    /**
+     * This error indicates a disallowed implicit dependency on a current capture
+     * sequence from cudaStreamLegacy.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT        = 906,
+
+    /**
+     * This error indicates that the operation is not permitted on an event which
+     * was last recorded in a capturing stream.
+     */
+    CUDA_ERROR_CAPTURED_EVENT                 = 907,
+
+    /**
+     * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
+     * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
+     * different thread.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD    = 908,
+
+    /**
+     * This error indicates that the timeout specified for the wait operation has lapsed.
+     */
+    CUDA_ERROR_TIMEOUT                        = 909,
+
+    /**
+     * This error indicates that the graph update was not performed because it included 
+     * changes which violated constraints specific to instantiated graph update.
+     */
+    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE      = 910,
+
+    /**
+     * This indicates that an async error has occurred in a device outside of CUDA.
+     * If CUDA was waiting for an external device's signal before consuming shared data,
+     * the external device signaled an error indicating that the data is not valid for
+     * consumption. This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must be
+     * terminated and relaunched.
+     */
+    CUDA_ERROR_EXTERNAL_DEVICE               = 911,
+
+    /**
+     * Indicates a kernel launch error due to cluster misconfiguration.
+     */
+    CUDA_ERROR_INVALID_CLUSTER_SIZE           = 912,
+
+    /**
+     * Indicates that a function handle is not loaded when calling an API that requires
+     * a loaded function.
+    */
+    CUDA_ERROR_FUNCTION_NOT_LOADED            = 913,
+
+    /**
+     * This error indicates one or more resources passed in are not valid resource
+     * types for the operation.
+    */
+    CUDA_ERROR_INVALID_RESOURCE_TYPE          = 914,
+
+    /**
+     * This error indicates one or more resources are insufficient or non-applicable for
+     * the operation.
+    */
+    CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION = 915,
+
+    /**
+     * This error indicates that an error happened during the key rotation
+     * sequence.
+    */
+    CUDA_ERROR_KEY_ROTATION                   = 916,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    CUDA_ERROR_UNKNOWN                        = 999
+} CUresult;
+
+/**
+ * Peer-to-peer (P2P) attributes describing the link between two devices
+ */
+typedef enum CUdevice_P2PAttribute_enum {
+    CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK                     = 0x01,  /**< A relative value indicating the performance of the link between two devices */
+    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED                     = 0x02,  /**< P2P access is enabled */
+    CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED              = 0x03,  /**< Atomic operations over the link are supported */
+    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED              = 0x04,  /**< \deprecated Use ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead (same value, 0x04) */
+    CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED          = 0x04   /**< Accessing CUDA arrays over the link is supported */
+} CUdevice_P2PAttribute;
+
+/**
+ * CUDA stream callback function pointer type (the callback returns nothing).
+ * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback.  May be NULL.
+ * \param status ::CUDA_SUCCESS or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData);
+
+/**
+ * Block size to per-block dynamic shared memory mapping for a certain kernel.
+ * \param blockSize Block size of the kernel.
+ *
+ * \return The dynamic shared memory needed by a block.
+ */
+typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_PORTABLE        0x01
+
+/**
+ * If set, host memory is mapped into the CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_DEVICEMAP       0x02
+
+/**
+ * If set, host memory is allocated as write-combined - fast to write,
+ * faster to DMA, slow to read except via the SSE4 streaming load instruction
+ * (MOVNTDQA).
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_PORTABLE     0x01
+
+/**
+ * If set, host memory is mapped into the CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02
+
+/**
+ * If set, the passed memory pointer is treated as pointing to some
+ * memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
+ * On Windows the flag is a no-op.
+ * On Linux that memory is marked as non cache-coherent for the GPU and
+ * is expected to be physically contiguous. It may return
+ * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, or
+ * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
+ * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED
+ * is returned.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_IOMEMORY     0x04
+
+/**
+ * If set, the passed memory pointer is treated as pointing to memory that is
+ * considered read-only by the device.  On platforms without
+ * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+ * required in order to register memory mapped to the CPU as read-only.  Support
+ * for the use of this flag can be queried from the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+ * a current context associated with a device that does not have this attribute
+ * set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
+ */
+#define CU_MEMHOSTREGISTER_READ_ONLY    0x08
+
+/**
+ * 2D memory copy parameters (source, destination and copy extent)
+ */
+typedef struct CUDA_MEMCPY2D_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+
+    size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
+    size_t Height;              /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D_v2;
+typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; /**< Alias of ::CUDA_MEMCPY2D_v2 */
+
+/**
+ * 3D memory copy parameters (source, destination and copy extent)
+ */
+typedef struct CUDA_MEMCPY3D_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    void *reserved0;            /**< Must be NULL */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_v2;
+typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; /**< Alias of ::CUDA_MEMCPY3D_v2 */
+
+/**
+ * 3D memory cross-context copy parameters (each side carries its own context)
+ */
+typedef struct CUDA_MEMCPY3D_PEER_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    CUcontext srcContext;       /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    CUcontext dstContext;       /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_PEER_v1;
+typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; /**< Alias of ::CUDA_MEMCPY3D_PEER_v1 */
+
+/**
+ * Memcpy node parameters (execution context plus a ::CUDA_MEMCPY3D description)
+ */
+typedef struct CUDA_MEMCPY_NODE_PARAMS_st {
+    int flags;                 /**< Must be zero */
+    int reserved;              /**< Must be zero */
+    CUcontext copyCtx;         /**< Context on which to run the node */
+    CUDA_MEMCPY3D copyParams;  /**< Parameters for the memory copy */
+} CUDA_MEMCPY_NODE_PARAMS;
+
+/**
+ * Array descriptor (extent, element format and channel count)
+ */
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of array */
+    size_t Height;            /**< Height of array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR_v2;
+typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; /**< Alias of ::CUDA_ARRAY_DESCRIPTOR_v2 */
+
+/**
+ * 3D array descriptor (extent, element format, channel count and flags)
+ */
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of 3D array */
+    size_t Height;            /**< Height of 3D array */
+    size_t Depth;             /**< Depth of 3D array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+    unsigned int Flags;       /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR_v2;
+typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; /**< Alias of ::CUDA_ARRAY3D_DESCRIPTOR_v2 */
+
+/**
+ * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers (value of CUDA_ARRAY_SPARSE_PROPERTIES::flags)
+ */
+#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1
+
+/**
+ * CUDA array sparse properties (tile extent and mip tail description)
+ */
+typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st {
+    struct {
+        unsigned int width;     /**< Width of sparse tile in elements */
+        unsigned int height;    /**< Height of sparse tile in elements */
+        unsigned int depth;     /**< Depth of sparse tile in elements */
+    } tileExtent;               /**< Extent of a sparse tile, in elements */
+
+    /**
+     * First mip level at which the mip tail begins.
+     */
+    unsigned int miptailFirstLevel;
+    /**
+     * Total size of the mip tail.
+     */
+    unsigned long long miptailSize;
+    /**
+     * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+     */
+    unsigned int flags;
+    unsigned int reserved[4];
+} CUDA_ARRAY_SPARSE_PROPERTIES_v1;
+typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; /**< Alias of ::CUDA_ARRAY_SPARSE_PROPERTIES_v1 */
+
+/**
+ * CUDA array memory requirements (size and alignment)
+ */
+typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st {
+    size_t size;                /**< Total required memory size */
+    size_t alignment;           /**< Alignment requirement */
+    unsigned int reserved[4];
+} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1;
+typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS; /**< Alias of ::CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 */
+
+/**
+ * CUDA resource descriptor: a resource type followed by a union of per-type parameters
+ */
+typedef struct CUDA_RESOURCE_DESC_st
+{
+    CUresourcetype resType;                   /**< Resource type */
+
+    union {
+        struct {
+            CUarray hArray;                   /**< CUDA array */
+        } array;
+        struct {
+            CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
+        } mipmap;
+        struct {
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t sizeInBytes;               /**< Size in bytes */
+        } linear;
+        struct {
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t width;                     /**< Width of the array in elements */
+            size_t height;                    /**< Height of the array in elements */
+            size_t pitchInBytes;              /**< Pitch between two rows in bytes */
+        } pitch2D;
+        struct {
+            int reserved[32];
+        } reserved;
+    } res;
+
+    unsigned int flags;                       /**< Flags (must be zero) */
+} CUDA_RESOURCE_DESC_v1;
+typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; /**< Alias of ::CUDA_RESOURCE_DESC_v1 */
+
+/**
+ * Texture descriptor (addressing, filtering, mipmap and border settings)
+ */
+typedef struct CUDA_TEXTURE_DESC_st {
+    CUaddress_mode addressMode[3];  /**< Address modes */
+    CUfilter_mode filterMode;       /**< Filter mode */
+    unsigned int flags;             /**< Flags */
+    unsigned int maxAnisotropy;     /**< Maximum anisotropy ratio */
+    CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+    float mipmapLevelBias;          /**< Mipmap level bias */
+    float minMipmapLevelClamp;      /**< Mipmap minimum level clamp */
+    float maxMipmapLevelClamp;      /**< Mipmap maximum level clamp */
+    float borderColor[4];           /**< Border color */
+    int reserved[12];
+} CUDA_TEXTURE_DESC_v1;
+typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; /**< Alias of ::CUDA_TEXTURE_DESC_v1 */
+
+/**
+ * Resource view formats (integer, floating point and block-compressed layouts)
+ */
+typedef enum CUresourceViewFormat_enum
+{
+    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */
+    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< 4 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< 1 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< 2 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< 4 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< 1 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< 2 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< 4 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< 1 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< 2 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< 4 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< 1 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< 2 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< 4 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 1 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 2 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 4 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 1 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 2 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 4 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
+} CUresourceViewFormat;
+
+/**
+ * Resource view descriptor (format, extent, mipmap level range and layer range)
+ */
+typedef struct CUDA_RESOURCE_VIEW_DESC_st
+{
+    CUresourceViewFormat format;   /**< Resource view format */
+    size_t width;                  /**< Width of the resource view */
+    size_t height;                 /**< Height of the resource view */
+    size_t depth;                  /**< Depth of the resource view */
+    unsigned int firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int firstLayer;       /**< First layer index */
+    unsigned int lastLayer;        /**< Last layer index */
+    unsigned int reserved[16];
+} CUDA_RESOURCE_VIEW_DESC_v1;
+typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; /**< Alias of ::CUDA_RESOURCE_VIEW_DESC_v1 */
+
+/**
+ * Size of the tensor map descriptor, counted in 64-bit words (length of CUtensorMap::opaque)
+ */
+#define CU_TENSOR_MAP_NUM_QWORDS 16
+
+/**
+ * Opaque tensor map descriptor. Aligned to 64 bytes via C++11 alignas or C11 _Alignas; no alignment specifier is emitted for older language modes.
+ */
+typedef struct CUtensorMap_st {
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+    alignas(64)
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+    _Alignas(64)
+#endif
+    cuuint64_t opaque[CU_TENSOR_MAP_NUM_QWORDS];
+} CUtensorMap;
+
+/**
+ * Tensor map data type (enumerators are numbered consecutively starting at 0)
+ */
+typedef enum CUtensorMapDataType_enum {
+    CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,
+    CU_TENSOR_MAP_DATA_TYPE_UINT16,
+    CU_TENSOR_MAP_DATA_TYPE_UINT32,
+    CU_TENSOR_MAP_DATA_TYPE_INT32,
+    CU_TENSOR_MAP_DATA_TYPE_UINT64,
+    CU_TENSOR_MAP_DATA_TYPE_INT64,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT64,
+    CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
+    CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
+    CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,
+    CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,
+    CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,
+    CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B
+} CUtensorMapDataType;
+
+/**
+ * Tensor map interleave layout type (enumerators are numbered consecutively starting at 0)
+ */
+typedef enum CUtensorMapInterleave_enum {
+    CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+    CU_TENSOR_MAP_INTERLEAVE_16B,
+    CU_TENSOR_MAP_INTERLEAVE_32B
+} CUtensorMapInterleave;
+
+/**
+ * Tensor map swizzling mode of shared memory banks (enumerators are numbered consecutively starting at 0)
+ */
+typedef enum CUtensorMapSwizzle_enum {
+    CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+    CU_TENSOR_MAP_SWIZZLE_32B,
+    CU_TENSOR_MAP_SWIZZLE_64B,
+    CU_TENSOR_MAP_SWIZZLE_128B,
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B,
+    CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B
+} CUtensorMapSwizzle;
+
+/**
+ * Tensor map L2 promotion type (enumerators are numbered consecutively starting at 0)
+ */
+typedef enum CUtensorMapL2promotion_enum {
+    CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+    CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+    CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+    CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+} CUtensorMapL2promotion;
+
+/**
+ * Tensor map out-of-bounds fill type (enumerators are numbered consecutively starting at 0)
+ */
+typedef enum CUtensorMapFloatOOBfill_enum {
+    CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+    CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+} CUtensorMapFloatOOBfill;
+
+/**
+ * Tensor map Im2Col wide mode (enumerators are numbered consecutively starting at 0)
+ */
+typedef enum CUtensorMapIm2ColWideMode_enum {
+    CU_TENSOR_MAP_IM2COL_WIDE_MODE_W = 0,
+    CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+} CUtensorMapIm2ColWideMode;
+
+/**
+ * GPU Direct v3 tokens (a P2P token and a VA space token)
+ */
+typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
+    unsigned long long p2pToken;
+    unsigned int vaSpaceToken;
+} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1;
+typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; /**< Alias of ::CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 */
+
+/**
+ * Access flags that specify the level of access the current context's device has
+ * on the memory referenced.
+ */
+typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum {
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE      = 0x0,   /**< No access: the device cannot access this memory at all, so it must be staged through accessible memory in order to complete certain operations */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ      = 0x1,   /**< Read-only access: writes to this memory are considered invalid accesses and return an error */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3    /**< Read-write access: the device has full read-write access to the memory */
+} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS;
+
+/**
+ * Kernel launch parameters (kernel, grid and block dimensions, shared memory, stream and arguments)
+ */
+typedef struct CUDA_LAUNCH_PARAMS_st {
+    CUfunction function;         /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    CUstream hStream;            /**< Stream identifier */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+} CUDA_LAUNCH_PARAMS_v1;
+typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; /**< Alias of ::CUDA_LAUNCH_PARAMS_v1 */
+
+/**
+ * External memory handle types (values 1 through 8; 0 is not assigned here)
+ */
+typedef enum CUexternalMemoryHandleType_enum {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+    /**
+     * Handle is a D3D12 heap object
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+    /**
+     * Handle is a D3D12 committed resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+    /**
+     * Handle is a shared NT handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+    /**
+     * Handle is a globally shared handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+    /**
+     * Handle is an NvSciBuf object
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8
+} CUexternalMemoryHandleType;
+
+/**
+ * Indicates that the external memory object is a dedicated resource
+ */
+#define CUDA_EXTERNAL_MEMORY_DEDICATED   0x1
+
+/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+ * contains this flag, it indicates that signaling an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which are otherwise performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01
+
+/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+ * contains this flag, it indicates that waiting on an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which are otherwise performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs signaler-specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs waiter-specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_WAIT 0x2
+/**
+ * External memory handle descriptor
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
+    /**
+     * Type of the handle
+     */
+    CUexternalMemoryHandleType type;
+    union {
+        /**
+         * File descriptor referencing the memory object. Valid
+         * when type is
+         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the memory object. Valid when
+         * type is one of the following:
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid memory object.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * A handle representing an NvSciBuf Object. Valid when type
+         * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF
+         */
+        const void *nvSciBufObject;
+    } handle;
+    /**
+     * Size of the memory allocation
+     */
+    unsigned long long size;
+    /**
+     * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+
+/**
+ * External memory buffer descriptor
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
+    /**
+     * Offset into the memory object at which the buffer's base is located
+     */
+    unsigned long long offset;
+    /**
+     * Size of the buffer
+     */
+    unsigned long long size;
+    /**
+     * Flags reserved for future use. Must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+
+/**
+ * External memory mipmapped array descriptor
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
+    /**
+     * Offset into the memory object at which the base level of the
+     * mipmap chain is located.
+     */
+    unsigned long long offset;
+    /**
+     * Format, dimension and type of the base level of the mipmap chain
+     */
+    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+    /**
+     * Total number of levels in the mipmap chain
+     */
+    unsigned int numLevels;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+
+/**
+ * External semaphore handle types; the value stored in ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type
+ */
+typedef enum CUexternalSemaphoreHandleType_enum {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD             = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32          = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT      = 3,
+    /**
+     * Handle is a shared NT handle referencing a D3D12 fence object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE           = 4,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 fence object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE           = 5,
+    /**
+     * Handle is an opaque handle to an NvSciSync object
+     */
+	CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC             = 6,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX     = 7,
+    /**
+     * Handle is a globally shared handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8,
+    /**
+     * Handle is an opaque file descriptor referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9,
+    /**
+     * Handle is an opaque shared NT handle referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
+} CUexternalSemaphoreHandleType;
+
+/**
+ * External semaphore handle descriptor
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
+    /**
+     * Type of the handle
+     */
+    CUexternalSemaphoreHandleType type;
+    union {
+        /**
+         * File descriptor referencing the semaphore object. Valid
+         * when type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the semaphore object. Valid when type is one of:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid synchronization primitive.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * Valid NvSciSyncObj. Must be non-NULL
+         */
+        const void* nvSciSyncObj;
+    } handle;
+    /**
+     * Flags reserved for future use. Must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+
+/**
+ * External semaphore signal parameters (::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS holds an array of these)
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of the fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
+             * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of the key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12];
+    } params;
+    /**
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to
+     * signal a ::CUexternalSemaphore of type
+     * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
+     * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which indicates
+     * that while signaling the ::CUexternalSemaphore, no memory synchronization
+     * operations should be performed for any external memory object imported
+     * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
+
+/**
+ * External semaphore wait parameters (::CUDA_EXT_SEM_WAIT_NODE_PARAMS holds an array of these)
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of the fence to be waited on
+             */
+            unsigned long long value;
+        } fence;
+        /**
+         * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
+         * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+         */
+        union {
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of the key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10];
+    } params;
+    /**
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
+     * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+     * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC,
+     * which indicates that while waiting for the ::CUexternalSemaphore, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS;
+
+/**
+ * External semaphore signal node parameters
+ */
+typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                         /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                  /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS;
+
+/**
+ * External semaphore signal node parameters (v2 declaration; same three fields as ::CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1)
+ */
+typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st {
+    CUexternalSemaphore* extSemArray;                         /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                  /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2;
+
+/**
+ * External semaphore wait node parameters
+ */
+typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                       /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS;
+
+/**
+ * External semaphore wait node parameters (v2 declaration; same three fields as ::CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1)
+ */
+typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st {
+    CUexternalSemaphore* extSemArray;                       /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2;
+
+typedef unsigned long long CUmemGenericAllocationHandle_v1; /**< Generic memory allocation handle, represented as an unsigned long long */
+typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; /**< Unversioned alias of ::CUmemGenericAllocationHandle_v1 */
+
+/**
+ * Flags for specifying particular handle types
+ */
+typedef enum CUmemAllocationHandleType_enum {
+    CU_MEM_HANDLE_TYPE_NONE                  = 0x0,  /**< Does not allow any export mechanism. */
+    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
+    CU_MEM_HANDLE_TYPE_WIN32                 = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
+    CU_MEM_HANDLE_TYPE_WIN32_KMT             = 0x4,  /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
+    CU_MEM_HANDLE_TYPE_FABRIC                = 0x8,  /**< Allows a fabric handle to be used for exporting. (CUmemFabricHandle) */
+    CU_MEM_HANDLE_TYPE_MAX                   = 0x7FFFFFFF
+} CUmemAllocationHandleType;
+
+/**
+ * Specifies the memory protection flags for mapping (the type of ::CUmemAccessDesc::flags).
+ */
+typedef enum CUmemAccess_flags_enum {
+    CU_MEM_ACCESS_FLAGS_PROT_NONE        = 0x0,  /**< Default, make the address range not accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READ        = 0x1,  /**< Make the address range read accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READWRITE   = 0x3,  /**< Make the address range read-write accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_MAX         = 0x7FFFFFFF
+} CUmemAccess_flags;
+
+/**
+ * Specifies the type of location (the type of ::CUmemLocation::type)
+ */
+typedef enum CUmemLocationType_enum {
+    CU_MEM_LOCATION_TYPE_INVALID    = 0x0,
+    CU_MEM_LOCATION_TYPE_DEVICE     = 0x1,  /**< Location is a device location, thus id is a device ordinal */
+    CU_MEM_LOCATION_TYPE_HOST       = 0x2,   /**< Location is the host, id is ignored */
+    CU_MEM_LOCATION_TYPE_HOST_NUMA  = 0x3,  /**< Location is a host NUMA node, thus id is a host NUMA node id */
+    CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT = 0x4,  /**< Location is the host NUMA node of the current thread, id is ignored */
+    CU_MEM_LOCATION_TYPE_MAX        = 0x7FFFFFFF
+} CUmemLocationType;
+
+/**
+ * Defines the allocation types available
+ */
+typedef enum CUmemAllocationType_enum {
+    CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
+
+    /** This allocation type is 'pinned', i.e. it cannot migrate from its current
+      * location while the application is actively using it
+      */
+    CU_MEM_ALLOCATION_TYPE_PINNED  = 0x1,
+    CU_MEM_ALLOCATION_TYPE_MAX     = 0x7FFFFFFF
+} CUmemAllocationType;
+
+/**
+ * Flags for requesting different optimal and required granularities for an allocation.
+ */
+typedef enum CUmemAllocationGranularity_flags_enum {
+    CU_MEM_ALLOC_GRANULARITY_MINIMUM     = 0x0,     /**< Minimum required granularity for allocation */
+    CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1      /**< Recommended granularity for allocation for best performance */
+} CUmemAllocationGranularity_flags;
+
+/**
+ * Specifies the handle type for an address range
+ */
+typedef enum CUmemRangeHandleType_enum
+{
+    CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 0x1,
+    CU_MEM_RANGE_HANDLE_TYPE_MAX        = 0x7FFFFFFF
+} CUmemRangeHandleType;
+
+/**
+ * Flags for requesting a handle type for an address range.
+ */
+typedef enum CUmemRangeFlags_enum {
+    CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE     = 0x1   /**< Indicates that the DMA_BUF handle should be mapped via PCIe BAR1 */
+} CUmemRangeFlags;
+
+/**
+ * Sparse subresource types (the type of ::CUarrayMapInfo::subresourceType)
+ */
+typedef enum CUarraySparseSubresourceType_enum {
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+} CUarraySparseSubresourceType;
+
+/**
+ * Memory operation types (the type of ::CUarrayMapInfo::memOperationType)
+ */
+typedef enum CUmemOperationType_enum {
+    CU_MEM_OPERATION_TYPE_MAP = 1,
+    CU_MEM_OPERATION_TYPE_UNMAP = 2
+} CUmemOperationType;
+
+/**
+ * Memory handle types (the type of ::CUarrayMapInfo::memHandleType)
+ */
+typedef enum CUmemHandleType_enum {
+    CU_MEM_HANDLE_TYPE_GENERIC = 0
+} CUmemHandleType;
+
+/**
+ * Specifies the CUDA array or CUDA mipmapped array memory mapping information
+ */
+typedef struct CUarrayMapInfo_st {    
+    CUresourcetype resourceType;                    /**< Resource type */
+
+    union {
+        CUmipmappedArray mipmap;
+        CUarray array;
+    } resource;
+
+    CUarraySparseSubresourceType subresourceType;   /**< Sparse subresource type */
+
+    union {
+        struct {
+            unsigned int level;                     /**< For CUDA mipmapped arrays must be a valid mipmap level. For CUDA arrays must be zero */            
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned int offsetX;                   /**< Starting X offset in elements */
+            unsigned int offsetY;                   /**< Starting Y offset in elements */
+            unsigned int offsetZ;                   /**< Starting Z offset in elements */            
+            unsigned int extentWidth;               /**< Width in elements */
+            unsigned int extentHeight;              /**< Height in elements */
+            unsigned int extentDepth;               /**< Depth in elements */
+        } sparseLevel;
+        struct {
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned long long offset;              /**< Offset within the mip tail */
+            unsigned long long size;                /**< Extent in bytes */
+        } miptail;
+    } subresource;
+    
+    CUmemOperationType memOperationType;            /**< Memory operation type */
+    CUmemHandleType memHandleType;                  /**< Memory handle type */
+
+    union {
+        CUmemGenericAllocationHandle memHandle;
+    } memHandle;
+    
+    unsigned long long offset;                      /**< Offset within the memory */
+    unsigned int deviceBitMask;                     /**< Device ordinal bit mask */
+    unsigned int flags;                             /**< Flags for future use, must be zero now. */
+    unsigned int reserved[2];                       /**< Reserved for future use, must be zero now. */
+} CUarrayMapInfo_v1;
+typedef CUarrayMapInfo_v1 CUarrayMapInfo;
+
+/**
+ * Specifies a memory location.
+ */
+typedef struct CUmemLocation_st {
+    CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */
+    int id;                 /**< Identifier whose meaning depends on this location's ::CUmemLocationType. */
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+/**
+ * Specifies the compression attribute for an allocation.
+ */
+typedef enum CUmemAllocationCompType_enum {
+    CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */
+    CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */
+} CUmemAllocationCompType;
+
+/**
+ * This flag, if set, indicates that the memory will be used as a tile pool.
+ */
+#define CU_MEM_CREATE_USAGE_TILE_POOL    0x1
+/**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware-accelerated decompression.
+ */
+#define CU_MEM_CREATE_USAGE_HW_DECOMPRESS 0x2
+
+/**
+ * Specifies the allocation properties for an allocation.
+ */
+typedef struct CUmemAllocationProp_st {
+    /** Allocation type */
+    CUmemAllocationType type;
+    /** Requested ::CUmemAllocationHandleType */
+    CUmemAllocationHandleType requestedHandleTypes;
+    /** Location of allocation */
+    CUmemLocation location;
+    /**
+     * Windows-specific POBJECT_ATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This object attributes structure
+     * includes security attributes that define
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32HandleMetaData;
+    struct {
+         /**
+         * Allocation hint for requesting compressible memory.
+         * On devices that support Compute Data Compression, compressible
+         * memory can be used to accelerate accesses to data with unstructured
+         * sparsity and other compressible data patterns. Applications are
+         * expected to query the allocation property of the handle obtained with
+         * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to
+         * validate whether the obtained allocation is compressible or not. Note that
+         * compressed memory may not be mappable on all devices.
+         */
+         unsigned char compressionType;
+         unsigned char gpuDirectRDMACapable;
+         /** Bitmask indicating intended usage for this allocation */
+         unsigned short usage;
+         unsigned char reserved[4];
+    } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
+/**
+ * Flags for querying different granularities for a multicast object
+ */
+typedef enum CUmulticastGranularity_flags_enum {
+    CU_MULTICAST_GRANULARITY_MINIMUM     = 0x0,     /**< Minimum required granularity */
+    CU_MULTICAST_GRANULARITY_RECOMMENDED = 0x1      /**< Recommended granularity for best performance */
+} CUmulticastGranularity_flags;
+
+/**
+ * Specifies the properties for a multicast object.
+ */
+typedef struct CUmulticastObjectProp_st {
+    /**
+     * The number of devices in the multicast team that will bind memory to this
+     * object
+     */
+    unsigned int numDevices;
+    /**
+     * The maximum amount of memory that can be bound to this multicast object
+     * per device
+     */
+    size_t size;
+    /**
+     * Bitmask of exportable handle types (see ::CUmemAllocationHandleType) for
+     * this object
+     */
+    unsigned long long handleTypes;
+    /**
+     * Flags for future use, must be zero now
+     */
+    unsigned long long flags;
+} CUmulticastObjectProp_v1;
+typedef CUmulticastObjectProp_v1 CUmulticastObjectProp;
+
+/**
+ * Memory access descriptor
+ */
+typedef struct CUmemAccessDesc_st {
+    CUmemLocation location;        /**< Location on which the request is to change its accessibility */
+    CUmemAccess_flags flags;       /**< ::CUmemAccess_flags accessibility flags to set on the request */
+} CUmemAccessDesc_v1;
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
+
+/**
+ * CUDA graph update result codes (the type of ::CUgraphExecUpdateResultInfo::result)
+ */
+typedef enum CUgraphExecUpdateResult_enum {
+    CU_GRAPH_EXEC_UPDATE_SUCCESS                     = 0x0, /**< The update succeeded */
+    CU_GRAPH_EXEC_UPDATE_ERROR                       = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+    CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED      = 0x2, /**< The update failed because the topology changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED     = 0x3, /**< The update failed because a node type changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED      = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+    CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED    = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED         = 0x6, /**< The update failed because something about the node is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */
+    CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED    = 0x8  /**< The update failed because the node attributes changed in a way that is not supported */
+} CUgraphExecUpdateResult;
+
+/**
+ * Result information returned by cuGraphExecUpdate
+ */
+typedef struct CUgraphExecUpdateResultInfo_st {
+    /**
+     * Gives more specific detail when a CUDA graph update fails.
+     */
+    CUgraphExecUpdateResult result;
+
+    /**
+     * The "to node" of the error edge when the topologies do not match.
+     * The error node when the error is associated with a specific node.
+     * NULL when the error is generic.
+     */
+    CUgraphNode errorNode;
+
+    /**
+     * The "from node" of the error edge when the topologies do not match. Otherwise NULL.
+     */
+    CUgraphNode errorFromNode;
+} CUgraphExecUpdateResultInfo_v1; 
+typedef CUgraphExecUpdateResultInfo_v1 CUgraphExecUpdateResultInfo;
+
+/**
+ * CUDA memory pool attributes
+ */
+typedef enum CUmemPool_attribute_enum {
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to use memory asynchronously freed
+     * in other streams as long as a stream ordering dependency
+     * of the allocating stream on the free action exists.
+     * CUDA events and null stream interactions can create the required
+     * stream ordered dependencies. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1,
+
+    /**
+     * (value type = int)
+     * Allow reuse of already completed frees when there is no dependency
+     * between the free and allocation. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to insert new stream dependencies
+     * in order to establish the stream ordering required to reuse
+     * a piece of memory released by cuMemFreeAsync (default enabled).
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of reserved memory in bytes to hold onto before trying
+     * to release memory back to the OS. When more than the release
+     * threshold bytes of memory are held by the memory pool, the
+     * allocator will try to release memory back to the OS on the
+     * next call to stream, event or context synchronize. (default 0)
+     */
+    CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of backing memory currently allocated for the mempool.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of backing memory allocated for the mempool since the
+     * last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory from the pool that is currently in use by the application.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of the amount of memory from the pool that was in use by the application since
+     * the last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_HIGH
+} CUmemPool_attribute;
+
+/**
+ * This flag, if set, indicates that the memory will be used as a buffer for
+ * hardware-accelerated decompression.
+ */
+#define CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS 0x2
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+typedef struct CUmemPoolProps_st {
+    CUmemAllocationType allocType;         /**< Allocation type. Currently must be specified as ::CU_MEM_ALLOCATION_TYPE_PINNED */
+    CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+    CUmemLocation location;                /**< Location where allocations should reside. */
+    /**
+     * Windows-specific LPSECURITYATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This security attribute defines
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32SecurityAttributes;
+    size_t maxSize;             /**< Maximum pool size. When set to 0, defaults to a system-dependent value. */
+    unsigned short usage;       /**< Bitmask indicating intended usage for the pool. */
+    unsigned char reserved[54]; /**< Reserved for future use, must be 0 */
+} CUmemPoolProps_v1;
+typedef CUmemPoolProps_v1 CUmemPoolProps;
+
+/**
+ * Opaque data for exporting a pool allocation (declared here as a 64-byte reserved array)
+ */
+typedef struct CUmemPoolPtrExportData_st {
+    unsigned char reserved[64];
+} CUmemPoolPtrExportData_v1;
+typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData;
+
+/**
+ * Memory allocation node parameters
+ */
+typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_v1_st {
+    /**
+     * in: location where the allocation should reside (specified in ::location).
+     * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
+     */
+    CUmemPoolProps poolProps;
+    const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. */
+    size_t bytesize; /**< in: size in bytes of the requested allocation */
+    CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */
+} CUDA_MEM_ALLOC_NODE_PARAMS_v1;
+typedef CUDA_MEM_ALLOC_NODE_PARAMS_v1 CUDA_MEM_ALLOC_NODE_PARAMS;
+
+/**
+ * Memory allocation node parameters (v2; the type of the \p alloc member of ::CUgraphNodeParams)
+ */
+typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_v2_st {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
+    */
+    CUmemPoolProps poolProps;
+    const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t accessDescCount; /**< in: number of memory access descriptors.  Must not exceed the number of GPUs. */
+    size_t bytesize; /**< in: size in bytes of the requested allocation */
+    CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */
+} CUDA_MEM_ALLOC_NODE_PARAMS_v2;
+
+/**
+ * Memory free node parameters
+ */
+typedef struct CUDA_MEM_FREE_NODE_PARAMS_st {
+    CUdeviceptr dptr; /**< in: the pointer to free */
+} CUDA_MEM_FREE_NODE_PARAMS;
+
+typedef enum CUgraphMem_attribute_enum {
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently associated with graphs
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, associated with graphs since the
+     * last time it was reset.  High watermark can only be reset to zero.
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
+} CUgraphMem_attribute;
+
+/**
+ * Child graph node parameters
+ */
+typedef struct CUDA_CHILD_GRAPH_NODE_PARAMS_st {
+    CUgraph graph; /**< The child graph to clone into the node for node creation, or
+                        a handle to the graph owned by the node for node query */
+} CUDA_CHILD_GRAPH_NODE_PARAMS;
+
+/**
+ * Event record node parameters
+ */
+typedef struct CUDA_EVENT_RECORD_NODE_PARAMS_st {
+    CUevent event; /**< The event to record when the node executes */
+} CUDA_EVENT_RECORD_NODE_PARAMS;
+
+/**
+ * Event wait node parameters
+ */
+typedef struct CUDA_EVENT_WAIT_NODE_PARAMS_st {
+    CUevent event; /**< The event to wait on from the node */
+} CUDA_EVENT_WAIT_NODE_PARAMS;
+
+/**
+ * Graph node parameters.  See ::cuGraphAddNode.
+ */
+typedef struct CUgraphNodeParams_st {
+    CUgraphNodeType type; /**< Type of the node */
+    int reserved0[3]; /**< Reserved. Must be zero. */
+
+    union {
+        long long                             reserved1[29]; /**< Padding. Unused bytes must be zero. */
+        CUDA_KERNEL_NODE_PARAMS_v3            kernel;        /**< Kernel node parameters. */
+        CUDA_MEMCPY_NODE_PARAMS               memcpy;        /**< Memcpy node parameters. */
+        CUDA_MEMSET_NODE_PARAMS_v2            memset;        /**< Memset node parameters. */
+        CUDA_HOST_NODE_PARAMS_v2              host;          /**< Host node parameters. */
+        CUDA_CHILD_GRAPH_NODE_PARAMS          graph;         /**< Child graph node parameters. */
+        CUDA_EVENT_WAIT_NODE_PARAMS           eventWait;     /**< Event wait node parameters. */
+        CUDA_EVENT_RECORD_NODE_PARAMS         eventRecord;   /**< Event record node parameters. */
+        CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2    extSemSignal;  /**< External semaphore signal node parameters. */
+        CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2      extSemWait;    /**< External semaphore wait node parameters. */
+        CUDA_MEM_ALLOC_NODE_PARAMS_v2         alloc;         /**< Memory allocation node parameters. */
+        CUDA_MEM_FREE_NODE_PARAMS             free;          /**< Memory free node parameters. */
+        CUDA_BATCH_MEM_OP_NODE_PARAMS_v2      memOp;         /**< MemOp node parameters. */
+        CUDA_CONDITIONAL_NODE_PARAMS          conditional;   /**< Conditional node parameters. */
+    };
+
+    long long reserved2; /**< Reserved bytes. Must be zero. */
+} CUgraphNodeParams;
+
+/**
+ * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only
+ * waits for prior work in the stream corresponding to that GPU to complete before the
+ * kernel begins execution.
+ */
+#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC   0x01
+
+/**
+ * If set, any subsequent work pushed in a stream that participated in a call to
+ * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on
+ * the GPU corresponding to that stream to complete before it begins execution.
+ */
+#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC  0x02
+
+/**
+ * If set, the CUDA array is a collection of layers, where each layer is either a 1D
+ * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
+ * of layers, not the depth of a 3D array.
+ */
+#define CUDA_ARRAY3D_LAYERED        0x01
+
+/**
+ * Deprecated, use CUDA_ARRAY3D_LAYERED
+ */
+#define CUDA_ARRAY3D_2DARRAY        0x01
+
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+
+/**
+ * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
+ * width of such a CUDA array must be equal to its height, and Depth must be six.
+ * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
+ * and Depth must be a multiple of six.
+ */
+#define CUDA_ARRAY3D_CUBEMAP        0x04
+
+/**
+ * This flag must be set in order to perform texture gather operations
+ * on a CUDA array.
+ */
+#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08
+
+/**
+ * This flag if set indicates that the CUDA
+ * array is a DEPTH_TEXTURE.
+ */
+#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10
+
+/**
+ * This flag indicates that the CUDA array may be bound as a color target
+ * in an external graphics API
+ */
+#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20
+
+/**
+ * This flag if set indicates that the CUDA array or CUDA mipmapped array
+ * is a sparse CUDA array or CUDA mipmapped array respectively
+ */
+#define CUDA_ARRAY3D_SPARSE 0x40
+
+/**
+ * This flag if set indicates that the CUDA array or CUDA mipmapped array
+ * will allow deferred memory mapping
+ */
+#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80
+
+/**
+ * This flag indicates that the CUDA array will be used for hardware accelerated
+ * video encode/decode operations.
+ */
+#define CUDA_ARRAY3D_VIDEO_ENCODE_DECODE 0x100
+
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_SRGB  0x10
+
+/**
+ * Disable any trilinear filtering optimizations.
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION  0x20
+
+/**
+ * Enable seamless cube map filtering.
+ * Flag for ::cuTexObjectCreate()
+ */
+#define CU_TRSF_SEAMLESS_CUBEMAP  0x40
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_END
+ */
+#define CU_LAUNCH_PARAM_END_AS_INT     0x00
+
+/**
+ * End of array terminator for the \p extra parameter to
+ * ::cuLaunchKernel
+ */
+#define CU_LAUNCH_PARAM_END            ((void*)CU_LAUNCH_PARAM_END_AS_INT)
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT 0x01
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
+ * parameters used for launching kernel \p f.  This buffer needs to
+ * honor all alignment/padding requirements of the individual parameters.
+ * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
+ * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
+ * effect.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER        ((void*)CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT)
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT 0x02
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a size_t which contains the
+ * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
+ * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
+ * in the \p extra array if the value associated with
+ * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE        ((void*)CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT)
+
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference.
+ */
+#define CU_PARAM_TR_DEFAULT -1
+
+/**
+ * Device that represents the CPU
+ */
+#define CU_DEVICE_CPU               ((CUdevice)-1)
+
+/**
+ * Device that represents an invalid device
+ */
+#define CU_DEVICE_INVALID           ((CUdevice)-2)
+
+/**
+ * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ */
+typedef enum CUflushGPUDirectRDMAWritesOptions_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST   = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */
+} CUflushGPUDirectRDMAWritesOptions;
+
+/**
+ * Platform native ordering for GPUDirect RDMA writes
+ */
+typedef enum CUGPUDirectRDMAWritesOrdering_enum {
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE        = 0,   /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER       = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200  /**< Any CUDA device in the system can consistently consume remote writes to this device. */
+} CUGPUDirectRDMAWritesOrdering;
+
+/**
+ * The scopes for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesScope_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER       = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+} CUflushGPUDirectRDMAWritesScope;
+ 
+/**
+ * The targets for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesTarget_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */
+} CUflushGPUDirectRDMAWritesTarget;
+
+/**
+ * The additional write options for ::cuGraphDebugDotPrint
+ */
+typedef enum CUgraphDebugDot_flags_enum {
+    CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE                        = 1<<0,  /**< Output all debug data as if every debug flag is enabled */
+    CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES                  = 1<<1,  /**< Use CUDA Runtime structures for output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS             = 1<<2,  /**< Adds CUDA_KERNEL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS             = 1<<3,  /**< Adds CUDA_MEMCPY3D values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS             = 1<<4,  /**< Adds CUDA_MEMSET_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS               = 1<<5,  /**< Adds CUDA_HOST_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS              = 1<<6,  /**< Adds CUevent handle from record and wait nodes to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS   = 1<<7,  /**< Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS     = 1<<8,  /**< Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES         = 1<<9,  /**< Adds CUkernelNodeAttrValue values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES                        = 1<<10, /**< Adds node handles and every kernel function handle to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS          = 1<<11, /**< Adds memory alloc node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS           = 1<<12, /**< Adds memory free node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS       = 1<<13, /**< Adds batch mem op node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXTRA_TOPO_INFO                = 1<<14, /**< Adds edge numbering information */
+    CU_GRAPH_DEBUG_DOT_FLAGS_CONDITIONAL_NODE_PARAMS        = 1<<15  /**< Adds conditional node parameters to output */
+} CUgraphDebugDot_flags;
+
+/**
+ * Flags for user objects for graphs
+ */
+typedef enum CUuserObject_flags_enum {
+    CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
+} CUuserObject_flags;
+
+/**
+ * Flags for retaining user object references for graphs
+ */
+typedef enum CUuserObjectRetain_flags_enum {
+    CU_GRAPH_USER_OBJECT_MOVE = 1  /**< Transfer references from the caller rather than creating new references. */
+} CUuserObjectRetain_flags;
+
+/**
+ * Flags for instantiating a graph
+ */
+typedef enum CUgraphInstantiate_flags_enum {
+    CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH  = 1 /**< Automatically free memory allocated in a graph before relaunching. */
+  , CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD               = 2 /**< Automatically upload the graph after instantiation. Only supported by
+                                                              ::cuGraphInstantiateWithParams.  The upload will be performed using the
+                                                              stream provided in \p instantiateParams. */
+  , CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH        = 4 /**< Instantiate the graph to be launchable from the device. This flag can only
+                                                              be used on platforms which support unified addressing. This flag cannot be
+                                                              used in conjunction with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH. */
+  , CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY    = 8 /**< Run the graph using the per-node priority attributes rather than the
+                                                              priority of the stream it is launched into. */
+} CUgraphInstantiate_flags;
+
+/**
+ * CUDA device NUMA configuration
+ */
+typedef enum CUdeviceNumaConfig_enum {
+    CU_DEVICE_NUMA_CONFIG_NONE = 0, /**< The GPU is not a NUMA node */
+    CU_DEVICE_NUMA_CONFIG_NUMA_NODE /**< The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID. Last enumerator: no trailing comma, for C89/C++03 compatibility. */
+} CUdeviceNumaConfig;
+
+/**
+ * CUDA Process States
+ */
+typedef enum CUprocessState_enum {
+    CU_PROCESS_STATE_RUNNING = 0,  /**< Default process state */
+    CU_PROCESS_STATE_LOCKED,       /**< CUDA API locks are taken so further CUDA API calls will block */
+    CU_PROCESS_STATE_CHECKPOINTED, /**< Application memory contents have been checkpointed and underlying allocations and device handles have been released */
+    CU_PROCESS_STATE_FAILED        /**< Application entered an uncorrectable error during the checkpoint/restore process. Last enumerator: no trailing comma, for C89/C++03 compatibility. */
+} CUprocessState;
+
+/**
+ * CUDA checkpoint optional lock arguments
+ */
+typedef struct CUcheckpointLockArgs_st {
+    unsigned int timeoutMs; /**< Timeout in milliseconds to attempt to lock the process, 0 indicates no timeout */
+    unsigned int reserved0; /**< Reserved for future use, must be zero */
+    cuuint64_t reserved1[7]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointLockArgs;
+
+/**
+ * CUDA checkpoint optional checkpoint arguments
+ */
+typedef struct CUcheckpointCheckpointArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointCheckpointArgs;
+
+/**
+ * CUDA checkpoint optional restore arguments
+ */
+typedef struct CUcheckpointRestoreArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointRestoreArgs;
+
+/**
+ * CUDA checkpoint optional unlock arguments
+ */
+typedef struct CUcheckpointUnlockArgs_st {
+    cuuint64_t reserved[8]; /**< Reserved for future use, must be zeroed */
+} CUcheckpointUnlockArgs;
+
+/**
+ * Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync.
+ */
+typedef enum CUmemcpyFlags_enum {
+    CU_MEMCPY_FLAG_DEFAULT = 0x0,
+
+    /**
+     * Hint to the driver to try and overlap the copy with compute work on the SMs.
+     */
+    CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE = 0x1
+} CUmemcpyFlags;
+
+/**
+ * These flags allow applications to convey the source access ordering CUDA must maintain.
+ * The destination will always be accessed in stream order.
+ */
+typedef enum CUmemcpySrcAccessOrder_enum {
+    /**
+     * Zero (default-initialized) value; not a valid access order.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_INVALID = 0x0,
+
+    /**
+     * Indicates that access to the source pointer must be in stream order.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_STREAM = 0x1,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and
+     * all accesses must be complete before the API call returns. This flag is suited for
+     * ephemeral sources (ex., stack variables) when it's known that no prior operations
+     * in the stream can be accessing the memory and also that the lifetime of the memory
+     * is limited to the scope that the source variable was declared in. Specifying
+     * this flag allows the driver to optimize the copy and removes the need for the user
+     * to synchronize the stream after the API call.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL = 0x2,
+
+    /**
+     * Indicates that access to the source pointer can be out of stream order and the accesses
+     * can happen even after the API call returns. This flag is suited for host pointers
+     * allocated outside CUDA (ex., via malloc) when it's known that no prior operations
+     * in the stream can be accessing the memory. Specifying this flag allows the driver
+     * to optimize the copy on certain platforms.
+     */
+    CU_MEMCPY_SRC_ACCESS_ORDER_ANY = 0x3,
+
+    CU_MEMCPY_SRC_ACCESS_ORDER_MAX = 0x7FFFFFFF /**< Not an access order; presumably a sentinel that pins the enum's range - confirm. */
+}  CUmemcpySrcAccessOrder;
+
+/**
+ * Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync.
+ */
+typedef struct CUmemcpyAttributes_st {
+    CUmemcpySrcAccessOrder srcAccessOrder;  /**< Source access ordering to be observed for copies with this attribute. */
+    CUmemLocation srcLocHint;               /**< Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    CUmemLocation dstLocHint;               /**< Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+    unsigned int flags;                     /**< Additional flags for copies with this attribute. See ::CUmemcpyFlags */
+} CUmemcpyAttributes_v1;
+typedef CUmemcpyAttributes_v1 CUmemcpyAttributes;
+
+/**
+ * These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync.
+ */
+typedef enum CUmemcpy3DOperandType_enum {
+    CU_MEMCPY_OPERAND_TYPE_POINTER = 0x1,     /**< Memcpy operand is a valid pointer. */
+    CU_MEMCPY_OPERAND_TYPE_ARRAY = 0x2,       /**< Memcpy operand is a CUarray. */
+    CU_MEMCPY_OPERAND_TYPE_MAX = 0x7FFFFFFF
+} CUmemcpy3DOperandType;
+
+/**
+ * Struct representing offset into a CUarray in elements
+ */
+typedef struct CUoffset3D_st {
+    size_t x; /**< x component of the offset, in elements. */
+    size_t y; /**< y component of the offset, in elements. */
+    size_t z; /**< z component of the offset, in elements. */
+} CUoffset3D_v1;
+typedef CUoffset3D_v1 CUoffset3D;
+
+/**
+ * Struct representing width/height/depth of a CUarray in elements
+ */
+typedef struct CUextent3D_st {
+    size_t width;  /**< Width, in elements. */
+    size_t height; /**< Height, in elements. */
+    size_t depth;  /**< Depth, in elements. */
+} CUextent3D_v1;
+typedef CUextent3D_v1 CUextent3D;
+
+/**
+ * Struct representing an operand for copy with ::cuMemcpy3DBatchAsync
+ */
+typedef struct CUmemcpy3DOperand_st {
+    CUmemcpy3DOperandType type; /**< Selects which member of \p op describes this operand. */
+    union {
+        /**
+         * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_POINTER
+         */
+        struct {
+            CUdeviceptr ptr;         /**< The pointer operand. */
+            size_t rowLength;        /**< Length of each row in elements. */
+            size_t layerHeight;      /**< Height of each layer in elements. */
+            CUmemLocation locHint;   /**< Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. */
+        } ptr;
+
+        /**
+         * Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_ARRAY
+         */
+        struct {
+            CUarray array;     /**< The CUarray operand. */
+            CUoffset3D offset; /**< Offset into \p array (::CUoffset3D is documented as being in elements). */
+        } array;
+    } op;
+} CUmemcpy3DOperand_v1;
+typedef CUmemcpy3DOperand_v1 CUmemcpy3DOperand;
+
+typedef struct CUDA_MEMCPY3D_BATCH_OP_st {
+    CUmemcpy3DOperand src;                    /**< Source memcpy operand. */
+    CUmemcpy3DOperand dst;                    /**< Destination memcpy operand. */
+    CUextent3D extent;                        /**< Extents of the memcpy between src and dst. The width, height and depth components must not be 0. */
+    CUmemcpySrcAccessOrder srcAccessOrder;    /**< Source access ordering to be observed for copy from src to dst. */
+    unsigned int flags;                       /**< Additional flags for this copy. See ::CUmemcpyFlags */
+} CUDA_MEMCPY3D_BATCH_OP_v1;
+typedef CUDA_MEMCPY3D_BATCH_OP_v1 CUDA_MEMCPY3D_BATCH_OP;
+
+/** @} */ /* END CUDA_TYPES */
+
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility push(default)
+  #endif
+#endif
+
+#ifdef _WIN32
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+/**
+ * \defgroup CUDA_ERROR Error Handling
+ *
+ * ___MANBRIEF___ error handling functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the error handling functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the string description of an error code
+ *
+ * Sets \p *pStr to the address of a NULL-terminated string description
+ * of the error code \p error.
+ * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ * will be returned and \p *pStr will be set to the NULL address.
+ *
+ * \param error - Error code to convert to string
+ * \param pStr - Address of the string pointer.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::CUresult,
+ * ::cudaGetErrorString
+ */
+CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr);
+
+/**
+ * \brief Gets the string representation of an error code enum name
+ *
+ * Sets \p *pStr to the address of a NULL-terminated string representation
+ * of the name of the enum error code \p error.
+ * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ * will be returned and \p *pStr will be set to the NULL address.
+ *
+ * \param error - Error code to convert to string
+ * \param pStr - Address of the string pointer.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::CUresult,
+ * ::cudaGetErrorName
+ */
+CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr);
+
+/** @} */ /* END CUDA_ERROR */
+
+/**
+ * \defgroup CUDA_INITIALIZE Initialization
+ *
+ * ___MANBRIEF___ initialization functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the initialization functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Initialize the CUDA driver API
+ *
+ * Initializes the driver API and must be called before any other function from the driver API in the
+ * current process. Currently, the \p Flags parameter must be 0. If ::cuInit() has not been called, any
+ * function from the driver API will return ::CUDA_ERROR_NOT_INITIALIZED.
+ *
+ * \param Flags - Initialization flag for CUDA.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
+ * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
+ * \notefnerr
+ */
+CUresult CUDAAPI cuInit(unsigned int Flags);
+
+/** @} */ /* END CUDA_INITIALIZE */
+
+/**
+ * \defgroup CUDA_VERSION Version Management
+ *
+ * ___MANBRIEF___ version management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the version management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the latest CUDA version supported by driver
+ *
+ * Returns in \p *driverVersion the version of CUDA supported by
+ * the driver.  The version is returned as
+ * (1000 &times; major + 10 &times; minor). For example, CUDA 9.2
+ * would be represented by 9020.
+ *
+ * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
+ * \p driverVersion is NULL.
+ *
+ * \param driverVersion - Returns the CUDA driver version
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaDriverGetVersion,
+ * ::cudaRuntimeGetVersion
+ */
+CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
+
+/** @} */ /* END CUDA_VERSION */
+
+/**
+ * \defgroup CUDA_DEVICE Device Management
+ *
+ * ___MANBRIEF___ device management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *device a device handle given an ordinal in the range <b>[0,
+ * ::cuDeviceGetCount()-1]</b>.
+ *
+ * \param device  - Returned device handle
+ * \param ordinal - Device number to get handle for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport
+ */
+CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 2.0 that are available for execution. If there is no such
+ * device, ::cuDeviceGetCount() returns 0.
+ *
+ * \param count - Returned number of compute-capable devices
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceCount
+ */
+CUresult CUDAAPI cuDeviceGetCount(int *count);
+
+/**
+ * \brief Returns an identifier string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p name. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param name - Returned identifier string for the device
+ * \param len  - Maximum length of string to store in \p name
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
+
+/**
+ * \brief Return a UUID for the device
+ *
+ * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will
+ * supplant this version in 12.0, which is retained for minor version compatibility.
+ *
+ * Returns 16 octets identifying the device \p dev in the structure
+ * pointed to by \p uuid.
+ *
+ * \param uuid - Returned UUID
+ * \param dev  - Device to get the UUID for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetUuid_v2,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
+
+/**
+ * \brief Return a UUID for the device (11.4+)
+ *
+ * Returns 16 octets identifying the device \p dev in the structure
+ * pointed to by \p uuid. If the device is in MIG mode, returns its
+ * MIG UUID which uniquely identifies the subscribed MIG compute instance.
+ *
+ * \param uuid - Returned UUID
+ * \param dev  - Device to get the UUID for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev);
+
+/**
+ * \brief Return an LUID and device node mask for the device
+ *
+ * Returns identifying information (\p luid and \p deviceNodeMask) to allow
+ * matching the device with graphics APIs.
+ *
+ * \param luid - Returned LUID
+ * \param deviceNodeMask - Returned device node mask
+ * \param dev  - Device to get the LUID and device node mask for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev);
+
+/**
+ * \brief Returns the total amount of memory on the device
+ *
+ * Returns in \p *bytes the total amount of memory available on the device
+ * \p dev in bytes.
+ *
+ * \param bytes - Returned total amount of memory on the device, in bytes
+ * \param dev   - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
+
+/**
+ * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
+ *
+ * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture
+ * for the given \p format and \p numChannels.
+ *
+ * \param maxWidthInElements    - Returned maximum number of texture elements allocatable for the given \p format and \p numChannels.
+ * \param format                - Texture format.
+ * \param numChannels           - Number of channels per texture element.
+ * \param dev                   - Device handle.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cudaMemGetInfo,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev);
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on device
+ * \p dev. The supported attributes are:
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
+ *   block;
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
+ *   shared memory available to a thread block in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
+ *   __constant__ variables in a CUDA C kernel in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
+ *   memory copy functions that involve memory regions allocated through
+ *   ::cuMemAllocPitch()
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
+ *  for a 1D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum
+ *  mipmapped 1D texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
+ *  texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
+ *  for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
+ *  for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
+ *  in bytes for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum
+ *  mipmapped 2D texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
+ *  mipmapped 2D texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
+ *  texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
+ *  texture depth
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
+ *  Alternate maximum 3D texture width, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
+ *  Alternate maximum 3D texture height, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
+ *  Alternate maximum 3D texture depth, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
+ *  Maximum cubemap texture width or height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
+ *  Maximum 1D layered texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
+ *   Maximum layers in a 1D layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
+ *  Maximum 2D layered texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
+ *   Maximum 2D layered texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
+ *   Maximum layers in a 2D layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
+ *   Maximum cubemap layered texture width or height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
+ *   Maximum layers in a cubemap layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
+ *   Maximum 1D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
+ *   Maximum 2D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
+ *   Maximum 2D surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
+ *   Maximum 3D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
+ *   Maximum 3D surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
+ *   Maximum 3D surface depth
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
+ *   Maximum 1D layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
+ *   Maximum layers in a 1D layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
+ *   Maximum 2D layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
+ *   Maximum 2D layered surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
+ *   Maximum layers in a 2D layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
+ *   Maximum cubemap surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
+ *   Maximum cubemap layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
+ *   Maximum layers in a cubemap layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
+ *   registers available to a thread block
+ * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz
+ * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
+ *   base addresses aligned to ::textureAlign bytes do not need an offset
+ *   applied to texture fetches
+ * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
+ *   for 2D texture references bound to pitched memory
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
+ *   memory between host and device while executing a kernel, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
+ *   the device
+ * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
+ *   for kernels executed on the device, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
+ *   memory subsystem, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
+ *   memory into the CUDA address space, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
+ *   in. Available modes are as follows:
+ *   - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
+ *     can have multiple CUDA contexts present at a single time.
+ *   - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
+ *     prohibited from creating new CUDA contexts.
+ *   - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS:  Compute-exclusive-process mode - Device
+ *     can have only one context used by a single process at a time.
+ * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
+ *   executing multiple kernels within the same context simultaneously, or 0 if
+ *   not. It is not guaranteed that multiple kernels will be resident
+ *   on the device concurrently so this feature should not be relied upon for
+ *   correctness.
+ * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
+ *    device, 0 if error correction is disabled or not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
+ *   of the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device
+ * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
+ *    is only available on Tesla hardware running Windows Vista or later
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits
+ * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
+ *   the host, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
+ *    in L1 cache, 0 if caching globals in L1 cache is not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
+ *    in L1 cache, 0 if caching locals in L1 cache is not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
+ *   shared memory available to a multiprocessor in bytes; this amount is shared
+ *   by all thread blocks simultaneously resident on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
+ *   registers available to a multiprocessor; this number is shared by all thread
+ *   blocks simultaneously resident on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
+ *   on this system, 0 if allocating managed memory is not supported by the device on this system.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices
+ *   associated with the same board. Devices on the same multi-GPU board will share the same identifier.
+ * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host
+ *   supports native atomic operations.
+ * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
+ *   (in floating-point operations per second) to double precision performance.
+ * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing
+ *   pageable memory without calling cudaHostRegister on it.
+ * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
+ *   concurrently with the CPU.
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption.
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
+ *   memory at the same virtual address as the CPU.
+ * -  ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size
+ *    supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() or cuKernelSetAttribute() call.
+ *    For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+ * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's
+ *   page tables.
+ * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration.
+ * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED:  Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED:  Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes 
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate.
+ * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
+ * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
+ * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
+ * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_CONFIG: NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum
+ * - ::CU_DEVICE_ATTRIBUTE_NUMA_ID: NUMA node ID of the GPU memory
+ * - ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED: Device supports switch multicast and reduction operations.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_DEVICE_ID: The combined 16-bit PCI device ID and 16-bit PCI vendor ID.
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_PCI_SUBSYSTEM_ID: The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem
+ *   vendor ID.
+ *
+ * \param pi     - Returned device attribute value
+ * \param attrib - Device attribute to query
+ * \param dev    - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaDeviceGetAttribute,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+
+/**
+ * \brief Return NvSciSync attributes that this device can support.
+ *
+ * Returns in \p nvSciSyncAttrList the properties of NvSciSync that
+ * this CUDA device, \p dev, can support. The returned \p nvSciSyncAttrList
+ * can be used to create an NvSciSync object that matches this device's capabilities.
+ *
+ * If the NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ * already set, this API will return ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * The application should set \p nvSciSyncAttrList to a valid
+ * NvSciSyncAttrList, failing which this API will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ *
+ * The \p flags parameter controls how the application intends to use
+ * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the application intends to
+ *   signal an NvSciSync on this CUDA device.
+ * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the application intends to
+ *   wait on an NvSciSync on this CUDA device.
+ *
+ * At least one of these flags must be set, failing which the API
+ * returns ::CUDA_ERROR_INVALID_VALUE. The two flags are orthogonal
+ * to one another: a developer may set both flags, which allows both
+ * wait- and signal-specific attributes to be set in the same \p nvSciSyncAttrList.
+ *
+ * Note that this API updates the input \p nvSciSyncAttrList with values equivalent
+ * to the following public attribute key-values:
+ * NvSciSyncAttrKey_RequiredPerm is set to
+ * - NvSciSyncAccessPerm_SignalOnly if ::CUDA_NVSCISYNC_ATTR_SIGNAL is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitOnly if ::CUDA_NVSCISYNC_ATTR_WAIT is set in \p flags.
+ * - NvSciSyncAccessPerm_WaitSignal if both ::CUDA_NVSCISYNC_ATTR_WAIT and
+ *   ::CUDA_NVSCISYNC_ATTR_SIGNAL are set in \p flags.
+ *
+ * NvSciSyncAttrKey_PrimitiveInfo is set to
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid \p dev.
+ * - NvSciSyncAttrValPrimitiveType_Syncpoint if \p dev is a Tegra device.
+ * - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if \p dev is GA10X+.
+ *
+ * NvSciSyncAttrKey_GpuId is set to the same UUID that is returned for
+ * \p dev from ::cuDeviceGetUuid.
+ *
+ * \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
+ * \param dev                   - Valid Cuda Device to get NvSciSync attributes for.
+ * \param flags                 - flags describing NvSciSync usage.
+ *
+ * \return
+ *
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags);
+
+/**
+ * \brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different
+ * than the one the stream runs on.
+ *
+ * \param dev  - Device whose current memory pool is set
+ * \param pool - Memory pool to make current for \p dev; must be local to that device
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync
+ */
+CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool);
+
+/**
+ * \brief Gets the current mempool for a device
+ *
+ * Returns the last pool provided to ::cuDeviceSetMemPool for this device
+ * or the device's default memory pool if ::cuDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device.
+ * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool.
+ *
+ * \param pool - Returned current memory pool of \p dev
+ * \param dev  - Device whose current memory pool is queried
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool
+ */
+CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev);
+
+/**
+ * \brief Returns the default mempool of a device
+ *
+ * The default mempool of a device contains device memory from that device.
+ *
+ * \param pool_out - Returned default memory pool of \p dev
+ * \param dev      - Device whose default memory pool is queried
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, ::cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev);
+
+/**
+ * \brief Returns information about the execution affinity support of the device.
+ *
+ * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev.
+ * The supported types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device,
+ *   or 0 if not.
+ *
+ * \param pi   - 1 if the execution affinity type \p type is supported by the device, or 0 if not
+ * \param type - Execution affinity type to query
+ * \param dev  - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev);
+
+/**
+ * \brief Blocks until remote writes are visible to the specified scope
+ *
+ * Blocks until GPUDirect RDMA writes to the target context via mappings
+ * created through APIs like nvidia_p2p_get_pages (see
+ * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ * visible to the specified scope.
+ *
+ * If the scope equals or lies within the scope indicated by
+ * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call
+ * will be a no-op and can be safely omitted for performance. This can be
+ * determined by comparing the numerical values between the two enums, with
+ * smaller scopes having smaller values.
+ *
+ * On platforms that support GPUDirect RDMA writes via more than one path in
+ * hardware (see ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE), the user should
+ * consider those paths as belonging to separate ordering domains. Note that in
+ * such cases CUDA driver will report both RDMA writes ordering and RDMA write
+ * scope as ALL_DEVICES and a call to ::cuFlushGPUDirectRDMAWrites will be a no-op,
+ * but when these multiple paths are used simultaneously, it is the user's
+ * responsibility to ensure ordering by using mechanisms outside the scope of
+ * CUDA.
+ *
+ * Users may query support for this API via
+ * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
+ *
+ * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget
+ * \param scope  - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+
+/** @} */ /* END CUDA_DEVICE */
+
+/**
+ * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated device management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns properties for a selected device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
+ * structure is defined as:
+ *
+ * \code
+     typedef struct CUdevprop_st {
+         int maxThreadsPerBlock;
+         int maxThreadsDim[3];
+         int maxGridSize[3];
+         int sharedMemPerBlock;
+         int totalConstantMemory;
+         int SIMDWidth;
+         int memPitch;
+         int regsPerBlock;
+         int clockRate;
+         int textureAlign;
+     } CUdevprop;
+ * \endcode
+ * where:
+ *
+ * - ::maxThreadsPerBlock is the maximum number of threads per block;
+ * - ::maxThreadsDim[3] is the maximum size of each dimension of a block;
+ * - ::maxGridSize[3] is the maximum size of each dimension of a grid;
+ * - ::sharedMemPerBlock is the total amount of shared memory available per
+ *   block in bytes;
+ * - ::totalConstantMemory is the total amount of constant memory available on
+ *   the device in bytes;
+ * - ::SIMDWidth is the warp size;
+ * - ::memPitch is the maximum pitch allowed by the memory copy functions that
+ *   involve memory regions allocated through ::cuMemAllocPitch();
+ * - ::regsPerBlock is the total number of registers available per block;
+ * - ::clockRate is the clock frequency in kilohertz;
+ * - ::textureAlign is the alignment requirement; texture base addresses that
+ *   are aligned to ::textureAlign bytes do not need an offset applied to
+ *   texture fetches.
+ *
+ * \param prop - Returned properties of device
+ * \param dev  - Device to get properties for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+
+/**
+ * \brief Returns the compute capability of the device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and its functionality superseded
+ * by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *major and \p *minor the major and minor revision numbers that
+ * define the compute capability of the device \p dev.
+ *
+ * \param major - Returned major revision number
+ * \param minor - Returned minor revision number
+ * \param dev   - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+
+/** @} */ /* END CUDA_DEVICE_DEPRECATED */
+
+/**
+ * \defgroup CUDA_PRIMARY_CTX Primary Context Management
+ *
+ * ___MANBRIEF___ primary context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the primary context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The primary context is unique per device and shared with the CUDA runtime API.
+ * These functions allow integration with other libraries using CUDA.
+ *
+ * @{
+ */
+
+/**
+ * \brief Retain the primary context on the GPU
+ *
+ * Retains the primary context on the device.
+ * Once the user successfully retains the primary context, the primary context
+ * will be active and available to the user until the user releases it
+ * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset().
+ * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack.
+ *
+ * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN
+ * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function
+ * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to
+ * determine the compute mode of the device.
+ * The <i>nvidia-smi</i> tool can be used to set the compute mode for
+ * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * Please note that the primary context always supports pinned allocations. Other
+ * flags can be specified by ::cuDevicePrimaryCtxSetFlags().
+ *
+ * \param pctx  - Returned handle of the retained primary context
+ * \param dev   - Device for which primary context is requested
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRelease,
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
+
+/**
+ * \brief Release the primary context on the GPU
+ *
+ * Releases the primary context interop on the device.
+ * A retained context should always be released once the user is done using
+ * it. The context is automatically reset once the last reference to it is
+ * released. This behavior is different when the primary context was retained
+ * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary
+ * context remains always active.
+ *
+ * Releasing a primary context that has not been previously retained will
+ * fail with ::CUDA_ERROR_INVALID_CONTEXT.
+ *
+ * Please note that unlike ::cuCtxDestroy() this method does not pop the context
+ * from the stack under any circumstances.
+ *
+ * \param dev - Device whose primary context is released
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+
+/**
+ * \brief Set flags for the primary context
+ *
+ * Sets the flags for the primary context on the device, overwriting previously
+ * set ones.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial settings will be taken from the global settings at the time of
+ * context creation. The other settings that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment 
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial settings will be taken from the global settings at the time of
+ * context creation. The other settings that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after 
+ * it becomes current.
+ *
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * \param dev   - Device for which the primary context flags are set
+ * \param flags - New flags for the device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxGetState,
+ * ::cuCtxCreate,
+ * ::cuCtxGetFlags,
+ * ::cuCtxSetFlags,
+ * ::cudaSetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+/**
+ * \brief Get the state of the primary context
+ *
+ * Returns in \p *flags the flags for the primary context of \p dev, and in
+ * \p *active whether it is active.  See ::cuDevicePrimaryCtxSetFlags for flag
+ * values.
+ *
+ * \param dev    - Device to get primary context flags for
+ * \param flags  - Pointer to store flags
+ * \param active - Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxGetFlags,
+ * ::cuCtxSetFlags,
+ * ::cudaGetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active);
+
+/**
+ * \brief Destroy all allocations and reset all state on the primary context
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.
+ *
+ * Note that it is the responsibility of the calling function to ensure that no
+ * other module in the process is using the device any more. For that reason
+ * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
+ * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
+ * even after resetting the device.
+ * Resetting the primary context does not release it; an application that has
+ * retained the primary context should explicitly release its usage.
+ *
+ * \param dev - Device for which primary context is destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxRelease,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceReset
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+
+/** @} */ /* END CUDA_PRIMARY_CTX */
+
+/**
+ * \defgroup CUDA_CTX Context Management
+ *
+ * ___MANBRIEF___ context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * Please note that some functions are described in
+ * \ref CUDA_PRIMARY_CTX "Primary Context Management" section.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a CUDA context
+ *
+ * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain.
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment 
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after 
+ * it becomes current.
+ * Setting this flag on any context creation is equivalent to setting the 
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ *
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx  - Returned context handle of the new context
+ * \param flags - Context creation flags
+ * \param dev   - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCoredumpSetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Create a CUDA context with execution affinity
+ *
+ * Creates a new CUDA context with execution affinity and associates it with
+ * the calling thread. The \p paramsArray and \p flags parameter are described below.
+ * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must
+ * call ::cuCtxDestroy() when done using the context. If a context is already
+ * current to the thread, it is supplanted by the newly created context and may
+ * be restored by a subsequent call to ::cuCtxPopCurrent().
+ *
+ * The type and the amount of execution resource the context can use is limited by \p paramsArray
+ * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams
+ * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ *   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ *   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ *   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ *   is only supported under Volta+ MPS.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment 
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be 
+ * modified by calling ::cuCoredumpSetAttribute from the created context after 
+ * it becomes current.
+ * Setting this flag on any context creation is equivalent to setting the 
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx        - Returned context handle of the new context
+ * \param paramsArray - Execution affinity parameters
+ * \param numParams   - Number of execution affinity parameters
+ * \param flags       - Context creation flags
+ * \param dev         - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCoredumpSetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Create a CUDA context
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ * 
+ * A CUDA context can be created with execution affinity. The type and the amount of
+ * execution resource the context can use is limited by \p paramsArray and \p numExecAffinityParams
+ * in \p execAffinity. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numExecAffinityParams
+ * describes the size of the paramsArray. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ *   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ *   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ *   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ *   is only supported under Volta+ MPS.
+ * 
+ * A CUDA context can be created in CIG (CUDA in Graphics) mode by setting \p cigParams.
+ * Data from the graphics client is shared with CUDA via the \p sharedData in \p cigParams.
+ * Support for the D3D12 graphics client can be determined using ::cuDeviceGetAttribute() with
+ * ::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED. \p sharedData is an ID3D12CommandQueue handle.
+ * Only one of \p execAffinityParams or \p cigParams may be set to a non-null value. Setting both to a
+ * non-null value results in undefined behavior.
+ * 
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally
+ * with ::cuCoredumpSetAttributeGlobal or environment variables, this flag can
+ * be set during context creation to instruct CUDA to create a coredump if
+ * this context raises an exception during execution. These environment variables
+ * are described in the CUDA-GDB user guide under the "GPU core dump support"
+ * section.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current. This flag is not supported when the CUDA context is created in
+ * CIG (CUDA in Graphics) mode.
+ *
+ * - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not
+ * been enabled globally with ::cuCoredumpSetAttributeGlobal or environment
+ * variables, this flag can be set during context creation to instruct CUDA to
+ * create a coredump if data is written to a certain pipe that is present in the
+ * OS space. These environment variables are described in the CUDA-GDB user
+ * guide under the "GPU core dump support" section.
+ * It is important to note that the pipe name *must* be set with
+ * ::cuCoredumpSetAttributeGlobal before creating the context if this flag is
+ * used. Setting this flag implies that ::CU_CTX_COREDUMP_ENABLE is set.
+ * The initial attributes will be taken from the global attributes at the time of
+ * context creation. The other attributes that control coredump output can be
+ * modified by calling ::cuCoredumpSetAttribute from the created context after
+ * it becomes current.
+ * Setting this flag on any context creation is equivalent to setting the
+ * ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally.
+ * This flag is not supported when the CUDA context is created in
+ * CIG (CUDA in Graphics) mode.
+ *
+ * - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated
+ * on this context will always synchronize. See further documentation in the
+ * section titled "API Synchronization behavior" to learn more about cases when
+ * synchronous memory operations can exhibit asynchronous behavior.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * Context creation will fail with ::CUDA_ERROR_INVALID_VALUE if an invalid parameter was
+ * passed by the client to create the CUDA context.
+ * 
+ * Context creation in CIG mode will fail with ::CUDA_ERROR_NOT_SUPPORTED if CIG is not supported
+ * by the device or the driver.
+ * \param pctx              - Returned context handle of the new context
+ * \param ctxCreateParams   - Context creation parameters
+ * \param flags             - Context creation flags
+ * \param dev               - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCoredumpSetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate_v4(CUcontext *pctx, CUctxCreateParams *ctxCreateParams, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Destroy a CUDA context
+ *
+ * Destroys the CUDA context specified by \p ctx.  The context \p ctx will be
+ * destroyed regardless of how many threads it is current to.
+ * It is the responsibility of the calling function to ensure that no API
+ * call issues using \p ctx while ::cuCtxDestroy() is executing.
+ *
+ * Destroys and cleans up all resources associated with the context.
+ * It is the caller's responsibility to ensure that the context or its resources
+ * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
+ * These resources include CUDA types ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
+ * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
+ * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
+ * These resources also include memory allocations by ::cuMemAlloc(), ::cuMemAllocHost(),
+ * ::cuMemAllocManaged() and ::cuMemAllocPitch().
+ *
+ * If \p ctx is current to the calling thread then \p ctx will also be
+ * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
+ * were called).  If \p ctx is current to other threads, then \p ctx will
+ * remain current to those threads, and attempting to access \p ctx from
+ * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
+ *
+ * \note ::cuCtxDestroy() will not destroy memory allocations by ::cuMemCreate(), ::cuMemAllocAsync() and
+ * ::cuMemAllocFromPoolAsync(). These memory allocations are not associated with any CUDA context and need to
+ * be destroyed explicitly.
+ *
+ * \param ctx - Context to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
+
+/**
+ * \brief Pushes a context on the current CPU thread
+ *
+ * Pushes the given context \p ctx onto the CPU thread's stack of current
+ * contexts. The specified context becomes the CPU thread's current context, so
+ * all CUDA functions that operate on the current context are affected.
+ *
+ * The previous current context may be made current again by calling
+ * ::cuCtxDestroy() or ::cuCtxPopCurrent().
+ *
+ * \param ctx - Context to push
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
+
+/**
+ * \brief Pops the current CUDA context from the current CPU thread.
+ *
+ * Pops the current CUDA context from the CPU thread and passes back the
+ * old context handle in \p *pctx. That context may then be made current
+ * to a different CPU thread by calling ::cuCtxPushCurrent().
+ *
+ * If a context was current to the CPU thread before ::cuCtxCreate() or
+ * ::cuCtxPushCurrent() was called, this function makes that context current to
+ * the CPU thread again.
+ *
+ * \param pctx - Returned popped context handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
+
+/**
+ * \brief Binds the specified CUDA context to the calling CPU thread
+ *
+ * Binds the specified CUDA context to the calling CPU thread.
+ * If \p ctx is NULL then the CUDA context previously bound to the
+ * calling CPU thread is unbound and ::CUDA_SUCCESS is returned.
+ *
+ * If there exists a CUDA context stack on the calling CPU thread, this
+ * will replace the top of that stack with \p ctx.
+ * If \p ctx is NULL then this will be equivalent to popping the top
+ * of the calling CPU thread's CUDA context stack (or a no-op if the
+ * calling CPU thread's CUDA context stack is empty).
+ *
+ * \param ctx - Context to bind to the calling CPU thread
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxGetCurrent,
+ * ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cudaSetDevice
+ */
+CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
+
+/**
+ * \brief Returns the CUDA context bound to the calling CPU thread.
+ *
+ * Returns in \p *pctx the CUDA context bound to the calling CPU thread.
+ * If no context is bound to the calling CPU thread then \p *pctx is
+ * set to NULL and ::CUDA_SUCCESS is returned.
+ *
+ * \param pctx - Returned context handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxSetCurrent,
+ * ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cudaGetDevice
+ */
+CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
+
+/**
+ * \brief Returns the device handle for the current context
+ *
+ * Returns in \p *device the handle of the current context's device.
+ *
+ * \param device - Returned device handle for the current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaGetDevice
+ */
+CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
+
+/**
+ * \brief Returns the flags for the current context
+ *
+ * Returns in \p *flags the flags of the current context. See ::cuCtxCreate
+ * for flag values.
+ *
+ * \param flags - Pointer to store flags of current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetCurrent,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetLimit,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuCtxSetFlags,
+ * ::cudaGetDeviceFlags
+ */
+CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags);
+
+/**
+ * \brief Sets the flags for the current context
+ *
+ * Sets the flags for the current context overwriting previously set ones. See
+ * ::cuDevicePrimaryCtxSetFlags for flag values.
+ *
+ * \param flags - Flags to set on the current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetCurrent,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetLimit,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuCtxGetFlags,
+ * ::cudaGetDeviceFlags,
+ * ::cuDevicePrimaryCtxSetFlags
+ */
+CUresult CUDAAPI cuCtxSetFlags(unsigned int flags);
+
+/**
+ * \brief Returns the unique Id associated with the context supplied
+ *
+ * Returns in \p ctxId the unique Id which is associated with a given context.
+ * The Id is unique for the life of the program for this instance of CUDA.
+ * If context is supplied as NULL and there is one current, the Id of the
+ * current context is returned.
+ *
+ * \param ctx - Context for which to obtain the Id
+ * \param ctxId - Pointer to store the Id of the context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPushCurrent
+ */
+CUresult CUDAAPI cuCtxGetId(CUcontext ctx, unsigned long long *ctxId);
+
+/**
+ * \brief Block for the current context's tasks to complete
+ *
+ * Blocks until the current context has completed all preceding requested tasks.
+ * If the current context is the primary context, green contexts that have been
+ * created will also be synchronized.
+ * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
+ * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
+ * CPU thread will block until the GPU context has finished its work.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cudaDeviceSynchronize
+ */
+CUresult CUDAAPI cuCtxSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the context. The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc). The application can use ::cuCtxGetLimit() to find out exactly
+ * what the limit has been set to.
+ *
+ * Setting each ::CUlimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread.
+ *   The driver automatically increases the per-thread stack size
+ *   for each kernel launch as needed. This size isn't reset back to the
+ *   original value after each launch. Setting this value will take effect 
+ *   immediately, and if necessary, the device will block until all preceding 
+ *   requested tasks are complete.
+ *
+ * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
+ *   by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
+ *   must be performed before launching any kernel that uses the ::printf()
+ *   device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
+ *   by the ::malloc() and ::free() device system calls. Setting
+ *   ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
+ *   that uses the ::malloc() or ::free() device system calls, otherwise
+ *   ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
+ *   a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ *   this limit must be performed before any launch of a kernel that uses the
+ *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ *   violated. This limit can be set smaller than the default or up to the maximum
+ *   launch depth of 24. When setting this limit, keep in mind that additional
+ *   levels of sync depth require the driver to reserve large amounts of device
+ *   memory which can no longer be used for user allocations. If these
+ *   reservations of device memory fail, ::cuCtxSetLimit() will return
+ *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability < 9.0.
+ *   Attempting to set this limit on devices of other compute capability
+ *   versions will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ *   returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
+ *   outstanding device runtime launches that can be made from the current
+ *   context. A grid is outstanding from the point of launch up until the grid
+ *   is known to have been completed. Device runtime launches which violate
+ *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ *   ::cudaGetLastError() is called after launch. If more pending launches than
+ *   the default (2048 launches) are needed for a module using the device
+ *   runtime, this limit can be increased. Keep in mind that being able to
+ *   sustain additional pending launches will require the driver to reserve
+ *   larger amounts of device memory upfront which can no longer be used for
+ *   allocations. If these reservations fail, ::cuCtxSetLimit() will return
+ *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ *   returned.
+ *
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
+ *   Values can range from 0B to 128B. This is purely a performance hint and
+ *   it can be ignored or clamped depending on the platform.
+ *
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes available for
+ *   persisting L2 cache. This is purely a performance hint and it can be
+ *   ignored or clamped depending on the platform.
+ *
+ * \param limit - Limit to set
+ * \param value - Size of limit
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_LIMIT,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceSetLimit
+ */
+CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
+
+/**
+ * \brief Returns resource limits
+ *
+ * Returns in \p *pvalue the current size of \p limit.  The supported
+ * ::CUlimit values are:
+ * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread.
+ * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the
+ *   ::printf() device system call.
+ * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread
+ *   can issue the device runtime call ::cudaDeviceSynchronize() to wait on
+ *   child grid launches to complete.
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding
+ *   device runtime launches that can be made from this context.
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity.
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes
+ *
+ * \param limit  - Limit to query
+ * \param pvalue - Returned size of limit
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_LIMIT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceGetLimit
+ */
+CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
+
+/**
+ * \brief Returns the preferred cache configuration for the current context.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this function returns through \p pconfig the preferred cache configuration
+ * for the current context. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute functions.
+ *
+ * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param pconfig - Returned cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig
+ */
+CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
+
+/**
+ * \brief Sets the preferred cache configuration for the current context.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the current context. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute the function. Any function preference
+ * set via ::cuFuncSetCacheConfig() or ::cuKernelSetCacheConfig() will be preferred over this context-wide
+ * setting. Setting the context-wide cache configuration to
+ * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer
+ * to not change the cache configuration unless required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceSetCacheConfig,
+ * ::cuKernelSetCacheConfig
+ */
+CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config);
+
+/**
+ * \brief Gets the context's API version.
+ *
+ * Returns a version number in \p version corresponding to the capabilities of
+ * the context (e.g. 3010 or 3020), which library developers can use to direct
+ * callers to a specific API version. If \p ctx is NULL, returns the API version
+ * used to create the currently bound context.
+ *
+ * Note that new API versions are only introduced when context capabilities are
+ * changed that break binary compatibility, so the API version and driver version
+ * may be different. For example, it is valid for the API version to be 3020 while
+ * the driver version is 4020.
+ *
+ * \param ctx     - Context to check
+ * \param version - Pointer to version
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
+
+/**
+ * \brief Returns numerical values that correspond to the least and
+ * greatest stream priorities.
+ *
+ * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ * to the least and greatest stream priorities respectively. Stream priorities
+ * follow a convention where lower numbers imply greater priorities. The range of
+ * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ * If the user attempts to create a stream with a priority value that is
+ * outside the meaningful range as specified by this API, the priority is
+ * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ * respectively. See ::cuStreamCreateWithPriority for details on creating a
+ * priority stream.
+ * A NULL may be passed in for \p leastPriority or \p greatestPriority if the value
+ * is not desired.
+ *
+ * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ * the current context's device does not support stream priorities
+ * (see ::cuDeviceGetAttribute).
+ *
+ * \param leastPriority    - Pointer to an int in which the numerical value for least
+ *                           stream priority is returned
+ * \param greatestPriority - Pointer to an int in which the numerical value for greatest
+ *                           stream priority is returned
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreateWithPriority,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceGetStreamPriorityRange
+ */
+CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
+
+/**
+ * \brief Resets all persisting lines in cache to normal status.
+ *
+ * ::cuCtxResetPersistingL2Cache resets all persisting lines in cache to normal
+ * status. Takes effect on function return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
+
+/**
+ * \brief Returns the execution affinity setting for the current context.
+ *
+ * Returns in \p *pExecAffinity the current value of \p type. The supported
+ * ::CUexecAffinityType values are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use.
+ *
+ * \param type          - Execution affinity type to query
+ * \param pExecAffinity - Returned execution affinity
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY
+ * \notefnerr
+ *
+ * \sa
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+
+/**
+ * \brief Records an event.
+ *
+ * Captures in \p hEvent all the activities of the context \p hCtx
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
+ * CUDA context, otherwise ::CUDA_ERROR_INVALID_HANDLE will be returned.
+ * Calls such as ::cuEventQuery() or ::cuCtxWaitEvent() will then examine
+ * or wait for completion of the work that was captured.
+ * Uses of \p hCtx after this call do not modify \p hEvent.
+ * If the context passed to \p hCtx is the primary context, \p hEvent will
+ * capture all the activities of the primary context and its green contexts.
+ * If the context passed to \p hCtx is a context converted from green context
+ * via ::cuCtxFromGreenCtx(), \p hEvent will capture only the activities of the green context.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
+ * specified context \p hCtx has a stream in the capture mode. In such a case,
+ * the call will invalidate all the conflicting captures.
+ * 
+ * \param hCtx - Context to record event for
+ * \param hEvent - Event to record
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuCtxWaitEvent,
+ * ::cuGreenCtxRecordEvent,
+ * ::cuGreenCtxWaitEvent,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuCtxRecordEvent(CUcontext hCtx, CUevent hEvent);
+
+/**
+ * \brief Make a context wait on an event
+ *
+ * Makes all future work submitted to context \p hCtx wait for all work
+ * captured in \p hEvent. The synchronization will be performed on the device
+ * and will not block the calling CPU thread. See ::cuCtxRecordEvent()
+ * for details on what is captured by an event.
+ * If the context passed to \p hCtx is the primary context, the primary context
+ * and its green contexts will wait for \p hEvent.
+ * If the context passed to \p hCtx is a context converted from green context
+ * via ::cuCtxFromGreenCtx(), the green context will wait for \p hEvent.
+ *
+ * \note \p hEvent may be from a different context or device than \p hCtx.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
+ * invalidate the capture if the specified event \p hEvent is part of an ongoing
+ * capture sequence or if the specified context \p hCtx has a stream in the capture mode.
+ *
+ * \param hCtx    - Context to wait
+ * \param hEvent  - Event to wait on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuCtxRecordEvent,
+ * ::cuGreenCtxRecordEvent,
+ * ::cuGreenCtxWaitEvent,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuCtxWaitEvent(CUcontext hCtx, CUevent hEvent);
+
+/** @} */ /* END CUDA_CTX */
+
+/**
+ * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Increment a context's usage-count
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated and should not be used.
+ *
+ * Increments the usage count of the context and passes back a context handle
+ * in \p *pctx that must be passed to ::cuCtxDetach() when the application is
+ * done with the context. ::cuCtxAttach() fails if there is no context current
+ * to the thread.
+ *
+ * Currently, the \p flags parameter must be 0.
+ *
+ * \param pctx  - Returned context handle of the current context
+ * \param flags - Context attach flags (must be 0)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxDetach,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
+
+/**
+ * \brief Decrement a context's usage-count
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated and should not be used.
+ *
+ * Decrements the usage count of the context \p ctx, and destroys the context
+ * if the usage count goes to 0. The context must be a handle that was passed
+ * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the
+ * calling thread.
+ *
+ * \param ctx - Context to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
+
+
+/**
+ * \brief Returns the current shared memory configuration for the current context.
+ *
+ * \deprecated
+ *
+ * This function will return in \p pConfig the current size of shared memory banks
+ * in the current context. On devices with configurable shared memory banks,
+ * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
+ * subsequent kernel launches will by default use the new bank size. When
+ * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
+ * memory, it will return the fixed bank size of the hardware.
+ *
+ * The returned bank configurations can be either:
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:  shared memory bank width is
+ *   four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
+ *   eight bytes.
+ *
+ * \param pConfig - returned shared memory configuration
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxSetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceGetSharedMemConfig
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current context.
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the context's shared memory bank size which is used for subsequent kernel
+ * launches.
+ *
+ * Changing the shared memory configuration between launches may insert a device
+ * side synchronization point between those launches.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
+ *   setting (currently, four bytes).
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes.
+ *
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceSetSharedMemConfig
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
+
+/** @} */ /* END CUDA_CTX_DEPRECATED */
+
+
+/**
+ * \defgroup CUDA_MODULE Module Management
+ *
+ * ___MANBRIEF___ module management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the module management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Loads a compute module
+ *
+ * Takes a filename \p fname and loads the corresponding module \p module into
+ * the current context. The CUDA driver API does not attempt to lazily
+ * allocate the resources needed by a module; if the memory for functions and
+ * data (constant and global) needed by the module cannot be allocated,
+ * ::cuModuleLoad() fails. The file should be a \e cubin file as output by
+ * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or
+ * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later.
+ *
+ * \param module - Returned module
+ * \param fname  - Filename of module to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_FILE_NOT_FOUND,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
+
+/**
+ * \brief Load a module's data
+ *
+ * Takes a pointer \p image and loads the corresponding module \p module into
+ * the current context. The \p image may be a \e cubin or \e fatbin
+ * as output by \b nvcc, or a NULL-terminated \e PTX, either as output by \b nvcc
+ * or hand-written.
+ *
+ * \param module - Returned module
+ * \param image  - Module data to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
+
+/**
+ * \brief Load a module's data with options
+ *
+ * Takes a pointer \p image and loads the corresponding module \p module into
+ * the current context. The \p image may be a \e cubin or \e fatbin
+ * as output by \b nvcc, or a NULL-terminated \e PTX, either as output by \b nvcc
+ * or hand-written.
+ *
+ * \param module       - Returned module
+ * \param image        - Module data to load
+ * \param numOptions   - Number of options
+ * \param options      - Options for JIT
+ * \param optionValues - Option values for JIT
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Load a module from a fat binary
+ *
+ * Takes a pointer \p fatCubin and loads the corresponding module \p module
+ * into the current context. The pointer represents a <i>fat binary</i> object,
+ * which is a collection of different \e cubin and/or \e PTX files, all
+ * representing the same device code, but compiled and optimized for different
+ * architectures.
+ *
+ * Prior to CUDA 4.0, there was no documented API for constructing and using
+ * fat binary objects by programmers.  Starting with CUDA 4.0, fat binary
+ * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc.
+ * More information can be found in the \b nvcc document.
+ *
+ * \param module   - Returned module
+ * \param fatCubin - Fat binary to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
+
+/**
+ * \brief Unloads a module
+ *
+ * Unloads a module \p hmod from the current context. Attempting to unload
+ * a module which was obtained from the Library Management API such as
+ * ::cuLibraryGetModule will return ::CUDA_ERROR_NOT_PERMITTED.
+ *
+ * \param hmod - Module to unload
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED
+ * \notefnerr
+ * \note_destroy_ub
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary
+ */
+CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
+
+/**
+ * CUDA Lazy Loading status, as returned through the \p mode argument of
+ * ::cuModuleGetLoadingMode. Exactly one of the two values describes whether
+ * lazy kernel loading is enabled.
+ */
+typedef enum CUmoduleLoadingMode_enum {
+    CU_MODULE_EAGER_LOADING = 0x1, /**< Lazy Kernel Loading is not enabled */
+    CU_MODULE_LAZY_LOADING  = 0x2, /**< Lazy Kernel Loading is enabled */
+} CUmoduleLoadingMode;
+
+/**
+ * \brief Query lazy loading mode
+ *
+ * Returns the lazy loading mode in \p mode.
+ * The module loading mode is controlled by the CUDA_MODULE_LOADING environment variable.
+ *
+ * \param mode      - Returns the lazy loading mode
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleLoad
+ */
+CUresult CUDAAPI cuModuleGetLoadingMode(CUmoduleLoadingMode *mode);
+
+/**
+ * \brief Returns a function handle
+ *
+ * Returns in \p *hfunc the handle of the function of name \p name located in
+ * module \p hmod. If no function of that name exists, ::cuModuleGetFunction()
+ * returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param hfunc - Returned function handle
+ * \param hmod  - Module to retrieve function from
+ * \param name  - Name of function to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+
+/**
+ * \brief Returns the number of functions within a module
+ *
+ * Returns in \p count the number of functions in \p mod.
+ *
+ * \param count - Number of functions found within the module
+ * \param mod - Module to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ */
+CUresult CUDAAPI cuModuleGetFunctionCount(unsigned int *count, CUmodule mod);
+
+/**
+ * \brief Returns the function handles within a module.
+ *
+ * Returns in \p functions a maximum number of \p numFunctions function handles within \p mod. When
+ * function loading mode is set to LAZY the function retrieved may be partially loaded. The loading
+ * state of a function can be queried using ::cuFuncIsLoaded. CUDA APIs may load the function
+ * automatically when called with a partially loaded function handle, which may incur additional
+ * latency. Alternatively, ::cuFuncLoad can be used to explicitly load a function. The returned
+ * function handles become invalid when the module is unloaded.
+ *
+ * \param functions - Buffer where the function handles are returned to
+ * \param numFunctions - Maximum number of function handles that may be returned to the buffer
+ * \param mod - Module to query from
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetFunctionCount,
+ * ::cuFuncIsLoaded,
+ * ::cuFuncLoad
+ */
+CUresult CUDAAPI cuModuleEnumerateFunctions(CUfunction *functions, unsigned int numFunctions, CUmodule mod);
+
+/**
+ * \brief Returns a global pointer from a module
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the
+ * global of name \p name located in module \p hmod. If no variable of that name
+ * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND.
+ * One of the parameters \p dptr or \p bytes (not both) can be NULL in which
+ * case it is ignored.
+ *
+ * \param dptr  - Returned global device pointer
+ * \param bytes - Returned global size in bytes
+ * \param hmod  - Module to retrieve global from
+ * \param name  - Name of global to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload,
+ * ::cudaGetSymbolAddress,
+ * ::cudaGetSymbolSize
+ */
+CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+
+/**
+ * \brief Creates a pending JIT linker invocation.
+ *
+ * If the call is successful, the caller owns the returned CUlinkState, which
+ * should eventually be destroyed with ::cuLinkDestroy.  The
+ * device code machine size (32 or 64 bit) will match the calling application.
+ *
+ * Both linker and compiler options may be specified.  Compiler options will
+ * be applied to inputs to this linker action which must be compiled from PTX.
+ * The options ::CU_JIT_WALL_TIME,
+ * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+ * will accumulate data until the CUlinkState is destroyed.
+ *
+ * The data passed in via ::cuLinkAddData and ::cuLinkAddFile will be treated
+ * as relocatable (-rdc=true to nvcc) when linking the final cubin during 
+ * ::cuLinkComplete and will have similar consequences as offline relocatable 
+ * device code linking.
+ *
+ * \p optionValues must remain valid for the life of the CUlinkState if output
+ * options are used.  No other references to inputs are maintained after this
+ * call returns.
+ *
+ * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+ *
+ * \param numOptions   Size of options arrays
+ * \param options      Array of linker and compiler options
+ * \param optionValues Array of option values, each cast to void *
+ * \param stateOut     On success, this will contain a CUlinkState to specify
+ *                     and complete this action
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuLinkAddData,
+ * ::cuLinkAddFile,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+
+/**
+ * \brief Add an input to a pending linker invocation
+ *
+ * Ownership of \p data is retained by the caller.  No reference is retained to any
+ * inputs after this call returns.
+ *
+ * This method accepts only compiler options, which are used if the data must
+ * be compiled from PTX, and does not accept any of
+ * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+ *
+ * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+ *
+ * \param state        A pending linker action.
+ * \param type         The type of the input data.
+ * \param data         The input data.  PTX must be NULL-terminated.
+ * \param size         The length of the input data.
+ * \param name         An optional name for this input in log messages.
+ * \param numOptions   Size of options.
+ * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate).
+ * \param optionValues Array of option values, each cast to void *.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddFile,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Add a file input to a pending linker invocation
+ *
+ * No reference is retained to any inputs after this call returns.
+ *
+ * This method accepts only compiler options, which are used if the input
+ * must be compiled from PTX, and does not accept any of
+ * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+ *
+ * This method is equivalent to invoking ::cuLinkAddData on the contents
+ * of the file.
+ *
+ * \note For LTO-IR input, only LTO-IR compiled with toolkits prior to CUDA 12.0 will be accepted
+ *
+ * \param state        A pending linker action
+ * \param type         The type of the input data
+ * \param path         Path to the input file
+ * \param numOptions   Size of options
+ * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate)
+ * \param optionValues Array of option values, each cast to void *
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_FILE_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddData,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Complete a pending linker invocation
+ *
+ * Completes the pending linker action and returns the cubin image for the linked
+ * device code, which can be used with ::cuModuleLoadData.  The cubin is owned by
+ * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy.
+ * This call does not destroy \p state.
+ *
+ * \param state    A pending linker invocation
+ * \param cubinOut On success, this will point to the output image
+ * \param sizeOut  Optional parameter to receive the size of the generated image
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddData,
+ * ::cuLinkAddFile,
+ * ::cuLinkDestroy,
+ * ::cuModuleLoadData
+ */
+CUresult CUDAAPI
+cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut);
+
+/**
+ * \brief Destroys state for a JIT linker invocation.
+ *
+ * \param state State object for the linker invocation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \sa ::cuLinkCreate
+ */
+CUresult CUDAAPI
+cuLinkDestroy(CUlinkState state);
+
+/** @} */ /* END CUDA_MODULE */
+
+/**
+ * \defgroup CUDA_MODULE_DEPRECATED Module Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated module management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated module management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a handle to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pTexRef the handle of the texture reference of name \p name
+ * in the module \p hmod. If no texture reference of that name exists,
+ * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference
+ * handle should not be destroyed, since it will be destroyed when the module
+ * is unloaded.
+ *
+ * \param pTexRef  - Returned texture reference
+ * \param hmod     - Module to retrieve texture reference from
+ * \param name     - Name of texture reference to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetSurfRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+
+/**
+ * \brief Returns a handle to a surface reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pSurfRef the handle of the surface reference of name \p name
+ * in the module \p hmod. If no surface reference of that name exists,
+ * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pSurfRef  - Returned surface reference
+ * \param hmod     - Module to retrieve surface reference from
+ * \param name     - Name of surface reference to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+
+/** @} */ /* END CUDA_MODULE_DEPRECATED */
+
+/**
+ * \defgroup CUDA_LIBRARY Library Management
+ *
+ * ___MANBRIEF___ library management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the library management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Load a library with specified code and options
+ *
+ * Takes a pointer \p code and loads the corresponding library \p library based on
+ * the application defined library loading mode:
+ * - If module loading is set to EAGER, via the environment variables described in "Module loading",
+ *   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+ *   at the time of creation until the library is unloaded with ::cuLibraryUnload().
+ * - If the environment variables are set to LAZY, \p library
+ *   is not immediately loaded onto all existent contexts and will only be
+ *   loaded when a function is needed for that context, such as a kernel launch.
+ *
+ * These environment variables are described in the CUDA programming guide under the 
+ * "CUDA environment variables" section.
+ *
+ * The \p code may be a \e cubin or \e fatbin as output by \b nvcc,
+ * or a NULL-terminated \e PTX, either as output by \b nvcc or hand-written.
+ * A fatbin should also contain relocatable code when doing separate compilation.
+ *
+ * Options are passed as an array via \p jitOptions and any corresponding parameters are passed in
+ * \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
+ * Any outputs will be returned via \p jitOptionsValues.
+ *
+ * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+ *
+ * \note If the library contains managed variables and no device in the system
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \param library             - Returned library
+ * \param code                - Code to load
+ * \param jitOptions          - Options for JIT
+ * \param jitOptionsValues    - Option values for JIT
+ * \param numJitOptions       - Number of options
+ * \param libraryOptions      - Options for loading
+ * \param libraryOptionValues - Option values for loading
+ * \param numLibraryOptions   - Number of options for loading
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx
+ */
+CUresult CUDAAPI cuLibraryLoadData(CUlibrary *library, const void *code,
+                                   CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions,
+                                   CUlibraryOption *libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions);
+
+/**
+ * \brief Load a library with specified file and options
+ *
+ * Takes a filename \p fileName and loads the corresponding library \p library based on
+ * the application defined library loading mode:
+ * - If module loading is set to EAGER, via the environment variables described in "Module loading",
+ *   \p library is loaded eagerly into all contexts at the time of the call and future contexts
+ *   at the time of creation until the library is unloaded with ::cuLibraryUnload().
+ * - If the environment variables are set to LAZY, \p library
+ *   is not immediately loaded onto all existent contexts and will only be
+ *   loaded when a function is needed for that context, such as a kernel launch.
+ *
+ * These environment variables are described in the CUDA programming guide under the 
+ * "CUDA environment variables" section.
+ *
+ * The file should be a \e cubin file as output by \b nvcc, or a \e PTX file either
+ * as output by \b nvcc or handwritten, or a \e fatbin file as output by \b nvcc.
+ * A fatbin should also contain relocatable code when doing separate compilation.
+ *
+ * Options are passed as an array via \p jitOptions and any corresponding parameters are
+ * passed in \p jitOptionsValues. The number of total JIT options is supplied via \p numJitOptions.
+ * Any outputs will be returned via \p jitOptionsValues.
+ *
+ * Library load options are passed as an array via \p libraryOptions and any corresponding parameters are passed in
+ * \p libraryOptionValues. The number of total library load options is supplied via \p numLibraryOptions.
+ *
+ * \note If the library contains managed variables and no device in the system
+ * supports managed variables this call is expected to return ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \param library             - Returned library
+ * \param fileName            - File to load from
+ * \param jitOptions          - Options for JIT
+ * \param jitOptionsValues    - Option values for JIT
+ * \param numJitOptions       - Number of options
+ * \param libraryOptions      - Options for loading
+ * \param libraryOptionValues - Option values for loading
+ * \param numLibraryOptions   - Number of options for loading
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryUnload,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx
+ */
+CUresult CUDAAPI cuLibraryLoadFromFile(CUlibrary *library, const char *fileName,
+                                       CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions,
+                                       CUlibraryOption *libraryOptions, void **libraryOptionValues, unsigned int numLibraryOptions);
+
+/**
+ * \brief Unloads a library
+ *
+ * Unloads the library specified with \p library
+ *
+ * \param library - Library to unload
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuLibraryUnload(CUlibrary library);
+
+/**
+ * \brief Returns a kernel handle
+ *
+ * Returns in \p pKernel the handle of the kernel with name \p name located in library \p library.
+ * If kernel handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pKernel - Returned kernel handle
+ * \param library - Library to retrieve kernel from
+ * \param name - Name of kernel to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction
+ */
+CUresult CUDAAPI cuLibraryGetKernel(CUkernel *pKernel, CUlibrary library, const char *name);
+
+/**
+ * \brief Returns the number of kernels within a library
+ *
+ * Returns in \p count the number of kernels in \p lib.
+ *
+ * \param count - Number of kernels found within the library
+ * \param lib - Library to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ */
+CUresult CUDAAPI cuLibraryGetKernelCount(unsigned int *count, CUlibrary lib);
+ 
+/**
+ * \brief Retrieve the kernel handles within a library.
+ *
+ * Returns in \p kernels a maximum number of \p numKernels kernel handles within \p lib.
+ * The returned kernel handle becomes invalid when the library is unloaded.
+ *
+ * \param kernels - Buffer where the kernel handles are returned to
+ * \param numKernels - Maximum number of kernel handles that may be returned to the buffer
+ * \param lib - Library to query from
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuLibraryGetKernelCount
+ */
+CUresult CUDAAPI cuLibraryEnumerateKernels(CUkernel *kernels, unsigned int numKernels, CUlibrary lib);
+
+/**
+ * \brief Returns a module handle
+ *
+ * Returns in \p pMod the module handle associated with the current context located in
+ * library \p library. If module handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pMod - Returned module handle
+ * \param library - Library to retrieve module from
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuModuleGetFunction
+ */
+CUresult CUDAAPI cuLibraryGetModule(CUmodule *pMod, CUlibrary library);
+
+/**
+ * \brief Returns a function handle
+ *
+ * Returns in \p pFunc the handle of the function for the requested kernel \p kernel and
+ * the current context. If function handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pFunc - Returned function handle
+ * \param kernel - Kernel to retrieve function for the requested context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetKernel,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction
+ */
+CUresult CUDAAPI cuKernelGetFunction(CUfunction *pFunc, CUkernel kernel);
+
+/**
+ * \brief Returns a library handle
+ *
+ * Returns in \p pLib the handle of the library for the requested kernel \p kernel
+ *
+ * \param pLib - Returned library handle
+ * \param kernel - Kernel to retrieve library handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetKernel
+ */
+CUresult CUDAAPI cuKernelGetLibrary(CUlibrary *pLib, CUkernel kernel);
+
+/**
+ * \brief Returns a global device pointer
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the global with
+ * name \p name for the requested library \p library and the current context.
+ * If no global for the requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND.
+ * One of the parameters \p dptr or \p bytes (not both) can be NULL in which
+ * case it is ignored.
+ *
+ * \param dptr - Returned global device pointer for the requested context
+ * \param bytes - Returned global size in bytes
+ * \param library - Library to retrieve global from
+ * \param name - Name of global to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetGlobal
+ */
+CUresult CUDAAPI cuLibraryGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+
+/**
+ * \brief Returns a pointer to managed memory
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the managed memory with
+ * name \p name for the requested library \p library. If no managed memory with the
+ * requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND. One of the parameters
+ * \p dptr or \p bytes (not both) can be NULL in which case it is ignored.
+ * Note that managed memory for library \p library is shared across devices and is registered
+ * when the library is loaded into at least one context.
+ *
+ * \param dptr - Returned pointer to the managed memory
+ * \param bytes - Returned memory size in bytes
+ * \param library - Library to retrieve managed memory from
+ * \param name - Name of managed memory to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload
+ */
+CUresult CUDAAPI cuLibraryGetManaged(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+
+/**
+ * \brief Returns a pointer to a unified function
+ *
+ * Returns in \p *fptr the function pointer to a unified function denoted by \p symbol.
+ * If no unified function with name \p symbol exists, the call returns ::CUDA_ERROR_NOT_FOUND.
+ * If there is no device with attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS present in the system,
+ * the call may return ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param fptr - Returned pointer to a unified function
+ * \param library - Library to retrieve function pointer memory from
+ * \param symbol - Name of function pointer to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_FOUND
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload
+ */
+CUresult CUDAAPI cuLibraryGetUnifiedFunction(void **fptr, CUlibrary library, const char *symbol);
+
+/**
+ * \brief Returns information about a kernel
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib for the kernel
+ * \p kernel for the requested device \p dev. The supported attributes are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
+ *   per block, beyond which a launch of the kernel would fail. This number
+ *   depends on both the kernel and the requested device.
+ * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
+ *   statically-allocated shared memory per block required by this kernel.
+ *   This does not include dynamically-allocated shared memory requested by
+ *   the user at runtime.
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+ *   constant memory required by this kernel.
+ * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+ *   used by each thread of this kernel.
+ * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+ *   of this kernel.
+ * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+ *   which the kernel was compiled. This value is the major PTX version * 10
+ *   + the minor PTX version, so a PTX version 1.3 function would return the
+ *   value 13. Note that this may return the undefined value of 0 for cubins
+ *   compiled prior to CUDA 3.0.
+ * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+ *   which the kernel was compiled. This value is the major binary
+ *   version * 10 + the minor binary version, so a binary version 1.3 function
+ *   would return the value 13. Note that this will return a value of 10 for
+ *   legacy cubins that do not have a properly-encoded binary architecture
+ *   version.
+ * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the kernel has
+ *   been compiled with user specified option "-Xptxas --dlcm=ca" set.
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+ *   cache split ratio in percent of total shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the
+ *   kernel must launch with a valid cluster size specified.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed. A non-portable cluster size may only function on the
+ *   specific SKUs the program is tested on. The launch might fail if the
+ *   program is run on a different hardware platform. CUDA API provides
+ *   cudaOccupancyMaxActiveClusters to assist with checking whether the desired
+ *   size can be launched on the current device. A portable cluster size is
+ *   guaranteed to be functional on all compute capabilities higher than the
+ *   target compute capability. The portable cluster size for sm_90 is 8 blocks
+ *   per cluster. This value may increase for future compute capabilities. The
+ *   specific hardware unit may support higher cluster sizes that’s not
+ *   guaranteed to be portable.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * \note If another thread is trying to set the same attribute on the same device using
+ * ::cuKernelSetAttribute() simultaneously, the attribute query will give the old or new
+ * value depending on the interleavings chosen by the OS scheduler and memory consistency.
+ *
+ * \param pi     - Returned attribute value
+ * \param attrib - Attribute requested
+ * \param kernel  - Kernel to query attribute of
+ * \param dev - Device to query attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuKernelSetAttribute,
+ * ::cuLibraryGetKernel,
+ * ::cuLaunchKernel,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuKernelGetAttribute(int *pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev);
+
+/**
+ * \brief Sets information about a kernel
+ *
+ * This call sets the value of a specified attribute \p attrib on the kernel \p kernel
+ * for the requested device \p dev to an integer value specified by \p val.
+ * This function returns CUDA_SUCCESS if the new value of the attribute could be
+ * successfully set. If the set fails, this call will return an error.
+ * Not all attributes can have values set. Attempting to set a value on a read-only
+ * attribute will result in an error (CUDA_ERROR_INVALID_VALUE)
+ *
+ * Note that attributes set using ::cuFuncSetAttribute() will override the attribute
+ * set by this API irrespective of whether the call to ::cuFuncSetAttribute() is made
+ * before or after this API call. However, ::cuKernelGetAttribute() will always
+ * return the attribute value set by this API.
+ *
+ * Supported attributes are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This is the maximum size in bytes of
+ *   dynamically-allocated shared memory. The value should contain the requested
+ *   maximum size of dynamically-allocated shared memory. The sum of this value and
+ *   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+ *   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+ *   The maximal size of requestable dynamic shared memory may differ by GPU
+ *   architecture.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+ *   cache and shared memory use the same hardware resources, this sets the shared memory
+ *   carveout preference, in percent of the total shared memory.
+ *   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * \note The API has stricter locking requirements in comparison to its legacy counterpart
+ * ::cuFuncSetAttribute() due to device-wide semantics. If multiple threads are trying to
+ * set the same attribute on the same device simultaneously, the attribute setting will depend
+ * on the interleavings chosen by the OS scheduler and memory consistency.
+ *
+ * \param attrib - Attribute requested
+ * \param val - Value to set
+ * \param kernel  - Kernel to set attribute of
+ * \param dev - Device to set attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuKernelGetAttribute,
+ * ::cuLibraryGetKernel,
+ * ::cuLaunchKernel,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction,
+ * ::cuFuncSetAttribute
+ */
+CUresult CUDAAPI cuKernelSetAttribute(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev);
+
+/**
+ * \brief Sets the preferred cache configuration for a device kernel.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the device kernel \p kernel on the requested device \p dev. This is only a preference.
+ * The driver will use the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute \p kernel.  Any context-wide preference
+ * set via ::cuCtxSetCacheConfig() will be overridden by this per-kernel
+ * setting.
+ *
+ * Note that attributes set using ::cuFuncSetCacheConfig() will override the attribute
+ * set by this API irrespective of whether the call to ::cuFuncSetCacheConfig() is made
+ * before or after this API call.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \note The API has stricter locking requirements in comparison to its legacy counterpart
+ * ::cuFuncSetCacheConfig() due to device-wide semantics. If multiple threads are trying to
+ * set a config on the same device simultaneously, the cache config setting will depend
+ * on the interleavings chosen by the OS scheduler and memory consistency.
+ *
+ * \param kernel  - Kernel to configure cache for
+ * \param config - Requested cache configuration
+ * \param dev - Device to set attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLibraryLoadData,
+ * ::cuLibraryLoadFromFile,
+ * ::cuLibraryUnload,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelGetFunction,
+ * ::cuLibraryGetModule,
+ * ::cuModuleGetFunction,
+ * ::cuFuncSetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuLaunchKernel
+ */
+CUresult CUDAAPI cuKernelSetCacheConfig(CUkernel kernel, CUfunc_cache config, CUdevice dev);
+
+/**
+ * \brief Returns the function name for a ::CUkernel handle
+ *
+ * Returns in \p **name the function name associated with the kernel handle \p hfunc .
+ * The function name is returned as a null-terminated string. The returned name is only 
+ * valid when the kernel handle is valid. If the library is unloaded or reloaded, one 
+ * must call the API again to get the updated name. This API may return a mangled name if
+ * the function is not declared as having C linkage. If either \p **name or \p hfunc 
+ * is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param name - The returned name of the function
+ * \param hfunc - The function handle to retrieve the name for 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuKernelGetName(const char **name, CUkernel hfunc);
+
+/**
+ * \brief Returns the offset and size of a kernel parameter in the device-side parameter layout
+ *
+ * Queries the kernel parameter at \p paramIndex into \p kernel's list of parameters, and returns
+ * in \p paramOffset and \p paramSize the offset and size, respectively, where the parameter
+ * will reside in the device-side parameter layout. This information can be used to update kernel
+ * node parameters from the device via ::cudaGraphKernelNodeSetParam() and
+ * ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters
+ * that \p kernel takes. \p paramSize can be set to NULL if only the parameter offset is desired.
+ *
+ * \param kernel      - The kernel to query
+ * \param paramIndex  - The parameter index to query
+ * \param paramOffset - Returns the offset into the device-side parameter layout at which the parameter resides
+ * \param paramSize   - Optionally returns the size of the parameter in the device-side parameter layout
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+* \sa ::cuFuncGetParamInfo
+ */
+CUresult CUDAAPI cuKernelGetParamInfo(CUkernel kernel, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+/** @} */ /* END CUDA_LIBRARY */
+
+/**
+ * \defgroup CUDA_MEM Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets free and total memory
+ *
+ * Returns in \p *total the total amount of memory available to the the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ * In a multi-tenet situation, free estimate returned is prone to race condition where
+ * a new allocation/free done by a different process or a different thread in the same
+ * process between the time when free memory was estimated and reported, will result in
+ * deviation in free value reported and actual free memory.
+ *
+ * The integrated GPU on Tegra shares memory with CPU and other component
+ * of the SoC. The free and total values returned by the API excludes
+ * the SWAP memory space maintained by the OS on some platforms.
+ * The OS may move some of the memory pages into swap area as the GPU or
+ * CPU allocate or access memory. See Tegra app note on how to calculate
+ * total and free memory on Tegra.
+ *
+ * \param free  - Returned free memory in bytes
+ * \param total - Returned total memory in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
+
+/**
+ * \brief Allocates device memory
+ *
+ * Allocates \p bytesize bytes of linear memory on the device and returns in
+ * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * \param dptr     - Returned device pointer
+ * \param bytesize - Requested allocation size in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMalloc
+ */
+CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
+
+/**
+ * \brief Allocates pitched device memory
+ *
+ * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
+ * the device and returns in \p *dptr a pointer to the allocated memory. The
+ * function may pad the allocation to ensure that corresponding pointers in
+ * any given row will continue to meet the alignment requirements for
+ * coalescing as the address is updated from row to row. \p ElementSizeBytes
+ * specifies the size of the largest reads and writes that will be performed
+ * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced
+ * memory transactions are not possible on other data sizes). If
+ * \p ElementSizeBytes is smaller than the actual read/write size of a kernel,
+ * the kernel will run correctly, but possibly at reduced speed. The pitch
+ * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the
+ * allocation. The intended usage of pitch is as a separate parameter of the
+ * allocation, used to compute addresses within the 2D array. Given the row
+ * and column of an array element of type \b T, the address is computed as:
+ * \code
+   T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
+ * \endcode
+ *
+ * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with
+ * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is
+ * recommended that programmers consider performing pitch allocations using
+ * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is
+ * especially true if the application will be performing 2D memory copies
+ * between different regions of device memory (whether linear memory or CUDA
+ * arrays).
+ *
+ * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed
+ * to match or exceed the alignment requirement for texture binding with
+ * ::cuTexRefSetAddress2D().
+ *
+ * \param dptr             - Returned device pointer
+ * \param pPitch           - Returned pitch of allocation in bytes
+ * \param WidthInBytes     - Requested allocation width in bytes
+ * \param Height           - Requested allocation height in rows
+ * \param ElementSizeBytes - Size of largest reads/writes for range
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocPitch
+ */
+CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+
+/**
+ * \brief Frees device memory
+ *
+ * Frees the memory space pointed to by \p dptr, which must have been returned
+ * by a previous call to one of the following memory allocation APIs - ::cuMemAlloc(), 
+ * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
+ *
+ * Note - This API will not perform any implict synchronization when the pointer was allocated with
+ * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to these
+ * pointer have completed before invoking ::cuMemFree. For best performance and memory reuse, users
+ * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
+ * For all other pointers, this API may perform implicit synchronization.
+ * 
+ * \param dptr - Pointer to memory to free
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemAllocManaged, ::cuMemAllocAsync, ::cuMemAllocFromPoolAsync, 
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, ::cuMemcpy3D, ::cuMemcpy3DAsync,
+ * ::cuMemcpyAtoA, ::cuMemcpyAtoD, ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ * ::cuMemcpyHtoAAsync, ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, ::cuMemFreeAsync,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFree
+ */
+CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
+
+/**
+ * \brief Get information on memory allocations
+ *
+ * Returns the base address in \p *pbase and size in \p *psize of the
+ * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
+ * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
+ * of them is NULL, it is ignored.
+ *
+ * \param pbase - Returned base address
+ * \param psize - Returned size of device memory allocation
+ * \param dptr  - Device pointer to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
+
+/**
+ * \brief Allocates page-locked host memory
+ *
+ * Allocates \p bytesize bytes of host memory that is page-locked and
+ * accessible to the device. The driver tracks the virtual memory ranges
+ * allocated with this function and automatically accelerates calls to
+ * functions such as ::cuMemcpy(). Since the memory can be accessed directly by
+ * the device, it can be read or written with much higher bandwidth than
+ * pageable memory obtained with functions such as ::malloc(). 
+ *
+ * On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES 
+ * is true, ::cuMemAllocHost may not page-lock the allocated memory.
+ *
+ * Page-locking excessive amounts of memory with ::cuMemAllocHost() may degrade system
+ * performance, since it reduces the amount of memory available to the system
+ * for paging. As a result, this function is best used sparingly to allocate
+ * staging areas for data exchange between host and device.
+ *
+ * Note all host memory allocated using ::cuMemAllocHost() will automatically
+ * be immediately accessible to all contexts on all devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ * The device pointer that may be used to access this host memory from those
+ * contexts is always equal to the returned host pointer \p *pp.
+ * See \ref CUDA_UNIFIED for additional details.
+ *
+ * \param pp       - Returned pointer to host memory
+ * \param bytesize - Requested allocation size in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocHost
+ */
+CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize);
+
+/**
+ * \brief Frees page-locked host memory
+ *
+ * Frees the memory space pointed to by \p p, which must have been returned by
+ * a previous call to ::cuMemAllocHost().
+ *
+ * \param p - Pointer to memory to free
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFreeHost
+ */
+CUresult CUDAAPI cuMemFreeHost(void *p);
+
+/**
+ * \brief Allocates page-locked host memory
+ *
+ * Allocates \p bytesize bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device,
+ * it can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). 
+ *
+ * On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES 
+ * is true, ::cuMemHostAlloc may not page-lock the allocated memory.
+ *
+ * Page-locking excessive amounts of memory may degrade system performance, 
+ * since it reduces the amount of memory available to the system for paging. 
+ * As a result, this function is best used sparingly to allocate staging areas 
+ * for data exchange between host and device.
+ *
+ * The \p Flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cuMemHostGetDevicePointer().
+ *
+ * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined
+ *   (WC). WC memory can be transferred across the PCI Express bus more
+ *   quickly on some system configurations, but cannot be read efficiently by
+ *   most CPUs. WC memory is a good option for buffers that will be written by
+ *   the CPU and read by the GPU via mapped pinned memory or host->device
+ *   transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag.
+ *
+ * The memory allocated by this function must be freed with ::cuMemFreeHost().
+ *
+ * Note all host memory allocated using ::cuMemHostAlloc() will automatically
+ * be immediately accessible to all contexts on all devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer
+ * that may be used to access this host memory from those contexts is always equal
+ * to the returned host pointer \p *pp.  If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED
+ * is specified, then the function ::cuMemHostGetDevicePointer() must be used
+ * to query the device pointer, even if the context supports unified addressing.
+ * See \ref CUDA_UNIFIED for additional details.
+ *
+ * \param pp       - Returned pointer to host memory
+ * \param bytesize - Requested allocation size in bytes
+ * \param Flags    - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaHostAlloc
+ */
+CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
+
+/**
+ * \brief Passes back device pointer of mapped pinned memory
+ *
+ * Passes back the device pointer \p pdptr corresponding to the mapped, pinned
+ * host buffer \p p allocated by ::cuMemHostAlloc.
+ *
+ * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP
+ * flag was not specified at the time the memory was allocated, or if the
+ * function is called on a GPU that does not support mapped pinned memory.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ * can also be accessed from the device using the host pointer \p p.
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * \p Flags provides for future releases. For now, it must be set to 0.
+ *
+ * \param pdptr - Returned device pointer
+ * \param p     - Host pointer
+ * \param Flags - Options (must be 0)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaHostGetDevicePointer
+ */
+CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
+
+/**
+ * \brief Passes back flags that were used for a pinned allocation
+ *
+ * Passes back the flags \p pFlags that were specified when allocating
+ * the pinned host buffer \p p allocated by ::cuMemHostAlloc.
+ *
+ * ::cuMemHostGetFlags() will fail if the pointer does not reside in
+ * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc().
+ *
+ * \param pFlags - Returned flags word
+ * \param p     - Host pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemAllocHost,
+ * ::cuMemHostAlloc,
+ * ::cudaHostGetFlags
+ */
+CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p bytesize bytes of managed memory on the device and returns in
+ * \p *dptr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
+ * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
+ * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
+ * ::cuStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cuStreamAttachMemAsync to
+ * a single stream, the default association as specified during ::cuMemAllocManaged
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a system where all GPUs have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cuMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
+ * non-zero value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all contexts created in
+ * that process on devices that support managed memory have to be peer-to-peer compatible
+ * with each other. Context creation will fail if a context is created on a device that
+ * supports managed memory and is not peer-to-peer compatible with any of the other
+ * managed memory supporting devices on which contexts were previously created, even if
+ * those contexts have been destroyed. These environment variables are described
+ * in the CUDA programming guide under the "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on discrete gpu with Drive PX-2.
+ *
+ * \param dptr     - Returned device pointer
+ * \param bytesize - Requested allocation size in bytes
+ * \param flags    - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync,
+ * ::cudaMallocManaged
+ */
+CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags);
+
+/**
+* \brief Registers a callback function to receive async notifications
+* 
+* Registers \p callbackFunc to receive async notifications.
+* 
+* The \p userData parameter is passed to the callback function at async notification time.  
+* Likewise, \p callback is also passed to the callback function to distinguish between
+* multiple registered callbacks.
+* 
+* The callback function being registered should be designed to return quickly (~10ms).  
+* Any long running tasks should be queued for execution on an application thread.
+* 
+* Callbacks may not call ::cuDeviceRegisterAsyncNotification or ::cuDeviceUnregisterAsyncNotification.
+* Doing so will result in ::CUDA_ERROR_NOT_PERMITTED. Async notification callbacks execute
+* in an undefined order and may be serialized.
+* 
+* Returns in \p *callback a handle representing the registered callback instance.
+* 
+* \param device - The device on which to register the callback
+* \param callbackFunc - The function to register as a callback
+* \param userData - A generic pointer to user data. This is passed into the callback function.
+* \param callback - A handle representing the registered callback instance
+* 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_NOT_SUPPORTED,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_UNKNOWN
+* \notefnerr
+* 
+* \sa
+* ::cuDeviceUnregisterAsyncNotification
+*/
+CUresult CUDAAPI cuDeviceRegisterAsyncNotification(CUdevice device, CUasyncCallback callbackFunc, void *userData, CUasyncCallbackHandle *callback);
+
+/**
+* \brief Unregisters an async notification callback
+* 
+* Unregisters \p callback so that the corresponding callback function will stop receiving
+* async notifications.
+* 
+* \param device - The device from which to remove \p callback.
+* \param callback - The callback instance to unregister from receiving async notifications.
+* 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_NOT_SUPPORTED,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_UNKNOWN
+* \notefnerr
+* 
+* \sa
+* ::cuDeviceRegisterAsyncNotification
+*/
+CUresult CUDAAPI cuDeviceUnregisterAsyncNotification(CUdevice device, CUasyncCallbackHandle callback);
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *dev a device handle given a PCI bus ID string.
+ *
+ * \param dev      - Returned device handle
+ *
+ * \param pciBusId - String in one of the following forms:
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetPCIBusId,
+ * ::cudaDeviceGetByPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len      - Maximum length of string to store in \p pciBusId
+ *
+ * \param dev      - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetByPCIBusId,
+ * ::cudaDeviceGetPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been
+ * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cuIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process,
+ * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
+ * ::cuEventQuery may be used in either process. Performing operations
+ * on the imported event after the exported event has been freed
+ * with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param pHandle - Pointer to a user allocated CUipcEventHandle
+ *                    in which to return the opaque event handle
+ * \param event   - Event allocated with ::CU_EVENT_INTERPROCESS and
+ *                    ::CU_EVENT_DISABLE_TIMING flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuEventCreate,
+ * ::cuEventDestroy,
+ * ::cuEventSynchronize,
+ * ::cuEventQuery,
+ * ::cuStreamWaitEvent,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcGetEventHandle
+ */
+CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with
+ * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
+ * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
+ * This event must be freed with ::cuEventDestroy.
+ *
+ * Performing operations on the imported event after the exported event has
+ * been freed with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param phEvent - Returns the imported event
+ * \param handle  - Interprocess handle to open
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuEventCreate,
+ * ::cuEventDestroy,
+ * ::cuEventSynchronize,
+ * ::cuEventQuery,
+ * ::cuStreamWaitEvent,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcOpenEventHandle
+ */
+CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
+
+/**
+ * \brief Gets an interprocess memory handle for an existing device memory
+ * allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created
+ * with ::cuMemAlloc and exports it for use in another process. This is a
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects.
+ *
+ * If a region of memory is freed with ::cuMemFree and a subsequent call
+ * to ::cuMemAlloc returns memory with the same device address,
+ * ::cuIpcGetMemHandle will return a unique handle for the
+ * new memory.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
+ *                    the handle in.
+ * \param dptr    - Base pointer to previously allocated device memory
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcGetMemHandle
+ */
+CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
+
+/**
+ * \brief Opens an interprocess memory handle exported from another process
+ * and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with ::cuIpcGetMemHandle into
+ * the current device address space. For contexts on different devices
+ * ::cuIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
+ * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
+ * ::cuDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * Contexts that may open ::CUipcMemHandles are restricted in the following way.
+ * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
+ * by one ::CUcontext per ::CUdevice per other process.
+ *
+ * If the memory handle has already been opened by the current context, the
+ * reference count on the handle is incremented by 1 and the existing device pointer
+ * is returned.
+ *
+ * Memory returned from ::cuIpcOpenMemHandle must be freed with
+ * ::cuIpcCloseMemHandle.
+ *
+ * Calling ::cuMemFree on an exported memory region before calling
+ * ::cuIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param pdptr  - Returned device pointer
+ * \param handle - ::CUipcMemHandle to open
+ * \param Flags  - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_TOO_MANY_PEERS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \note No guarantees are made about the address returned in \p *pdptr.
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cuCtxEnablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaIpcOpenMemHandle
+ */
+CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+
+/**
+ * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle
+ *
+ * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1.
+ * When the reference count reaches 0, this API unmaps the memory. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is supported for compatibility purposes
+ * but not recommended as it comes with performance cost.
+ * Users can test their device for IPC functionality by calling
+ * ::cuDeviceGetAttribute with ::CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED
+ *
+ * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle
+ */
+CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
+
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p p and \p bytesize and maps it
+ * for the device(s) as specified by \p Flags. This memory range also is added
+ * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
+ * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
+ * directly by the device, it can be read or written with much higher bandwidth
+ * than pageable memory that has not been registered.  Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ *
+ * On systems where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES 
+ * is true, ::cuMemHostRegister will not page-lock the memory range specified 
+ * by \p p but only populate unpopulated pages.
+ *
+ * The \p Flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cuMemHostGetDevicePointer().
+ *
+ * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
+ *   I/O memory space, e.g. the PCI Express resource of a 3rd party device.
+ *
+ * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
+ *   that is considered read-only by the device.  On platforms without
+ *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+ *   required in order to register memory mapped to the CPU as read-only.  Support
+ *   for the use of this flag can be queried from the device attribute
+ *   ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+ *   a current context associated with a device that does not have this attribute
+ *   set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ * can also be accessed from the device using the host pointer \p p.
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with
+ * ::cuMemHostUnregister().
+ *
+ * \param p        - Host pointer to memory to page-lock
+ * \param bytesize - Size in bytes of the address range to page-lock
+ * \param Flags    - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostUnregister,
+ * ::cuMemHostGetFlags,
+ * ::cuMemHostGetDevicePointer,
+ * ::cudaHostRegister
+ */
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+
+/**
+ * \brief Unregisters a memory range that was registered with ::cuMemHostRegister.
+ *
+ * Unmaps the memory range whose base address is specified by \p p, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cuMemHostRegister().
+ *
+ * \param p - Host pointer to memory to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostRegister,
+ * ::cudaHostUnregister
+ */
+CUresult CUDAAPI cuMemHostUnregister(void *p);
+
+/**
+ * \brief Copies memory
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ *   device, device to device, or device to host) from the pointer values.  This
+ *   function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+
+/**
+ * \brief Copies device memory between two contexts
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context.  \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice  - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice  - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount  - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeer
+ */
+CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol
+ */
+CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Host
+ *
+ * Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ * base pointers of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Device
+ *
+ * Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ * are the base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Array
+ *
+ * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the destination
+ * data. \p srcDevice specifies the base pointer of the source. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyToArray
+ */
+CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Device
+ *
+ * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the
+ * base pointer of the destination and must be naturally aligned with the CUDA
+ * array elements. \p srcArray and \p srcOffset specify the CUDA array handle
+ * and the offset in bytes into the array where the copy is to begin.
+ * \p ByteCount specifies the number of bytes to copy and must be evenly
+ * divisible by the array element size.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Array
+ *
+ * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the destination
+ * data.  \p srcHost specifies the base address of the source. \p ByteCount specifies
+ * the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyToArray
+ */
+CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Host
+ *
+ * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ * array handle and starting offset in bytes of the source data.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Array
+ *
+ * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ * specify the handles of the destination and source CUDA arrays for the copy,
+ * respectively. \p dstOffset and \p srcOffset specify the destination and
+ * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ * bytes to be copied. The elements in the CUDA arrays need not be the same
+ * format, but the elements must be the same size; and \p ByteCount must be
+ * evenly divisible by that size.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyArrayToArray
+ */
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+          const void *srcHost;
+          CUdeviceptr srcDevice;
+          CUarray srcArray;
+          unsigned int srcPitch;
+
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+          void *dstHost;
+          CUdeviceptr dstDevice;
+          CUarray dstArray;
+          unsigned int dstPitch;
+
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the
+ *   destination data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ * significantly slower in the cases where ::cuMemcpy2D() would have returned
+ * an error code.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DFromArray
+ */
+CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the
+ *   destination data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ * significantly slower in the cases where ::cuMemcpy2D() would have returned
+ * an error code.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DFromArray
+ */
+CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+        typedef struct CUDA_MEMCPY3D_st {
+
+            unsigned int srcXInBytes, srcY, srcZ;
+            unsigned int srcLOD;
+            CUmemorytype srcMemoryType;
+                const void *srcHost;
+                CUdeviceptr srcDevice;
+                CUarray srcArray;
+                unsigned int srcPitch;  // ignored when src is array
+                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+            unsigned int dstXInBytes, dstY, dstZ;
+            unsigned int dstLOD;
+            CUmemorytype dstMemoryType;
+                void *dstHost;
+                CUdeviceptr dstDevice;
+                CUarray dstArray;
+                unsigned int dstPitch;  // ignored when dst is array
+                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+            unsigned int WidthInBytes;
+            unsigned int Height;
+            unsigned int Depth;
+        } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the
+ *   destination data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ *   data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ *   destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ *   and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy3D
+ */
+CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
+
+/**
+ * \brief Copies memory between contexts
+ *
+ * Performs a 3D memory copy according to the parameters specified in
+ * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpy3DPeer
+ */
+CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+
+/**
+ * \brief Copies memory asynchronously
+ *
+ * Copies data between two pointers. \p dst and \p src are the base pointers
+ * of the destination and source, respectively. \p ByteCount specifies the
+ * number of bytes to copy.
+ *
+ * Note that this function infers the type of the transfer (host to host, host
+ * to device, device to device, or device to host) from the pointer values.
+ * This function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst       - Destination unified virtual address space pointer
+ * \param src       - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies device memory between two contexts asynchronously.
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context.  \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice  - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice  - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount  - Size of memory copy in bytes
+ * \param hStream    - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeerAsync
+ */
+CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Host to Device asynchronously
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Device to Host asynchronously
+ *
+ * Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ * base pointers of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Device to Device asynchronously
+ *
+ * Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ * are the base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Host to Array asynchronously
+ *
+ * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the
+ * destination data. \p srcHost specifies the base address of the source.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyToArrayAsync
+ */
+CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Array to Host asynchronously
+ *
+ * Copies from a 1D CUDA array to host memory. \p dstHost specifies the base
+ * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ * array handle and starting offset in bytes of the source data.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory for 2D arrays asynchronously
+ *
+ * Performs a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination
+ *   data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
+ *
+ * \param pCopy   - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory for 3D arrays asynchronously
+ *
+ * Performs a 3D memory copy according to the parameters specified in
+ * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+        typedef struct CUDA_MEMCPY3D_st {
+
+            unsigned int srcXInBytes, srcY, srcZ;
+            unsigned int srcLOD;
+            CUmemorytype srcMemoryType;
+                const void *srcHost;
+                CUdeviceptr srcDevice;
+                CUarray srcArray;
+                unsigned int srcPitch;  // ignored when src is array
+                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+            unsigned int dstXInBytes, dstY, dstZ;
+            unsigned int dstLOD;
+            CUmemorytype dstMemoryType;
+                void *dstHost;
+                CUdeviceptr dstDevice;
+                CUarray dstArray;
+                unsigned int dstPitch;  // ignored when dst is array
+                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+            unsigned int WidthInBytes;
+            unsigned int Height;
+            unsigned int Depth;
+        } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination
+ *   data and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ *   data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ *   destination data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ *   and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy   - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy3DAsync
+ */
+CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory between contexts asynchronously.
+ *
+ * Performs a 3D memory copy according to the parameters specified in
+ * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
+ *
+ * \param pCopy   - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeer,
+ * ::cudaMemcpy3DPeerAsync
+ */
+CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
+
+/**
+ * \brief Performs a batch of memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies.
+ * For copies involving CUDA arrays, please see ::cuMemcpy3DBatchAsync.
+ *
+ * Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts.
+ * The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified
+ * by \p count. Since there are no ordering guarantees for copies within a batch, specifying any dependent copies
+ * within a batch will result in undefined behavior.
+ *
+ * Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array.
+ * Each entry in this array can apply to more than one copy. This can be done by specifying in the \p attrsIdxs array,
+ * the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
+ * \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
+ * in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
+ * will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contains the two sets of attributes. Note that the first entry
+ * in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
+ * less than \p count. Furthermore, \p numAttrs must be lesser than or equal to \p count.
+ *
+ * The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ * with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch must
+ * have a valid ::CUmemcpyAttributes corresponding to it including the appropriate srcAccessOrder setting, otherwise the API
+ * will return ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * The ::CUmemcpyAttributes::srcLocHint and ::CUmemcpyAttributes::dstLocHint allows applications to specify hint locations
+ * for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
+ * only applicable for managed memory pointers on devices where ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or
+ * system-allocated pageable memory on devices where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true.
+ * For other cases, these hints are ignored.
+ *
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx. 
+ *
+ * \param dsts          - Array of destination pointers.
+ * \param srcs          - Array of memcpy source pointers.
+ * \param sizes         - Array of sizes for memcpy operations.
+ * \param count         - Size of \p dsts, \p srcs and \p sizes arrays
+ * \param attrs         - Array of memcpy attributes. 
+ * \param attrsIdxs     - Array of indices to specify which copies each entry in the \p attrs array applies to.
+ *                        The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k]
+ *                        through attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies starting from
+ *                        attrsIdxs[numAttrs-1] through count - 1.
+ * \param numAttrs      - Size of \p attrs and \p attrsIdxs arrays.
+ * \param failIdx       - Pointer to a location to return the index of the copy where a failure was encountered.
+ *                        The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param hStream       - The stream to enqueue the operations in. Must not be legacy NULL stream.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_DEINITIALIZED
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
+                                    CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
+                                    size_t *failIdx, CUstream hStream);
+
+/**
+ * \brief Performs a batch of 3D memory copies asynchronously.
+ *
+ * Performs a batch of memory copies. The batch as a whole executes in stream order but copies within a
+ * batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent
+ * copies within a batch will result in undefined behavior.
+ *
+ * Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps.
+ * Each entry in this array describes a copy operation. This includes, among other things, the source and destination
+ * operands for the copy as specified in ::CUDA_MEMCPY3D_BATCH_OP::src and ::CUDA_MEMCPY3D_BATCH_OP::dst respectively.
+ * The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth
+ * of a copy are specified in ::CUDA_MEMCPY3D_BATCH_OP::extent. The width, height and depth of a copy are specified in
+ * elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer
+ * to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies,
+ * the element size of the two CUDA arrays must match.
+ *
+ * For a given operand, if ::CUmemcpy3DOperand::type is specified as ::CU_MEMCPY_OPERAND_TYPE_POINTER, then
+ * ::CUmemcpy3DOperand::op::ptr will be used. The ::CUmemcpy3DOperand::op::ptr::ptr field must contain the pointer where
+ * the copy should begin. The ::CUmemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and
+ * must either be zero or be greater than or equal to the width of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::width.
+ * The ::CUmemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be zero or be greater than
+ * or equal to the height of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::height. When either of these values is zero,
+ * that aspect of the operand is considered to be tightly packed according to the copy extent. For managed memory pointers on devices where
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or system-allocated pageable memory on devices where
+ * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true, the ::CUmemcpy3DOperand::op::ptr::locHint field can be used to hint
+ * the location of the operand.
+ *
+ * If an operand's type is specified as ::CU_MEMCPY_OPERAND_TYPE_ARRAY, then ::CUmemcpy3DOperand::op::array will be used.
+ * The ::CUmemcpy3DOperand::op::array::array field specifies the CUDA array and ::CUmemcpy3DOperand::op::array::offset specifies
+ * the 3D offset into that array where the copy begins.
+ *
+ * The ::CUDA_MEMCPY3D_BATCH_OP::srcAccessOrder field indicates the source access ordering to be observed for that
+ * copy. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ * be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ * it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ * the API call returns. This flag is suited for ephemeral sources (ex., stack variables) when it's known that no prior
+ * operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ * that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ * need for the user to synchronize the stream after the API call. If the source access order is set to
+ * ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ * accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ * outside CUDA (ex., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ * Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in \p opList must
+ * have a valid srcAccessOrder setting, otherwise this API will return ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ * ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ * any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ * NOTE(review): this function takes no ::CUmemcpyAttributes argument, so the field meant here is presumably the
+ * per-operation flags of each \p opList entry rather than the \p flags parameter (which must be zero) - confirm.
+ *
+ * If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ * will be returned in \p failIdx.
+ *
+ * \param numOps     - Total number of memcpy operations.
+ * \param opList     - Array of size \p numOps containing the actual memcpy operations.
+ * \param failIdx    - Pointer to a location to return the index of the copy where a failure was encountered.
+ *                     The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ * \param flags      - Flags for future use, must be zero now.
+ * \param hStream    - The stream to enqueue the operations in. Must not be default NULL stream.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_DEINITIALIZED
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_memcpy
+ */
+CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
+                                      size_t *failIdx, unsigned long long flags, CUstream hStream);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 8-bit values starting at \p dstDevice to
+ * the specified value \p uc.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param uc        - 8-bit value to set
+ * \param N         - Number of 8-bit elements to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 16-bit values starting at \p dstDevice to
+ * the specified value \p us. The \p dstDevice pointer must be two byte aligned.
+ *
+ * \param dstDevice - Destination device pointer (must be two byte aligned)
+ * \param us        - 16-bit value to set
+ * \param N         - Number of 16-bit elements to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 32-bit values starting at \p dstDevice to
+ * the specified value \p ui. The \p dstDevice pointer must be four byte aligned.
+ *
+ * \param dstDevice - Destination device pointer (must be four byte aligned)
+ * \param ui        - 32-bit value to set
+ * \param N         - Number of 32-bit elements to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 8-bit values to the specified value
+ * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer, in bytes (unused if \p Height is 1)
+ * \param uc        - 8-bit value to set
+ * \param Width     - Width of row, in 8-bit elements
+ * \param Height    - Number of rows to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 16-bit values to the specified value
+ * \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be two byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer (must be two byte aligned)
+ * \param dstPitch  - Pitch of destination device pointer, in bytes (unused if \p Height is 1)
+ * \param us        - 16-bit value to set
+ * \param Width     - Width of row, in 16-bit elements
+ * \param Height    - Number of rows to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 32-bit values to the specified value
+ * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be four byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer (must be four byte aligned)
+ * \param dstPitch  - Pitch of destination device pointer, in bytes (unused if \p Height is 1)
+ * \param ui        - 32-bit value to set
+ * \param Width     - Width of row, in 32-bit elements
+ * \param Height    - Number of rows to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 8-bit values starting at \p dstDevice to
+ * the specified value \p uc.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param uc        - 8-bit value to set
+ * \param N         - Number of 8-bit elements to set
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 16-bit values starting at \p dstDevice to
+ * the specified value \p us. The \p dstDevice pointer must be two byte aligned.
+ *
+ * \param dstDevice - Destination device pointer (must be two byte aligned)
+ * \param us        - 16-bit value to set
+ * \param N         - Number of 16-bit elements to set
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 32-bit values starting at \p dstDevice to
+ * the specified value \p ui. The \p dstDevice pointer must be four byte aligned.
+ *
+ * \param dstDevice - Destination device pointer (must be four byte aligned)
+ * \param ui        - 32-bit value to set
+ * \param N         - Number of 32-bit elements to set
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 8-bit values to the specified value
+ * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer, in bytes (unused if \p Height is 1)
+ * \param uc        - 8-bit value to set
+ * \param Width     - Width of row, in 8-bit elements
+ * \param Height    - Number of rows to set
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 16-bit values to the specified value
+ * \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be two byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer (must be two byte aligned)
+ * \param dstPitch  - Pitch of destination device pointer, in bytes (unused if \p Height is 1)
+ * \param us        - 16-bit value to set
+ * \param Width     - Width of row, in 16-bit elements
+ * \param Height    - Number of rows to set
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 32-bit values to the specified value
+ * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be four byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer (must be four byte aligned)
+ * \param dstPitch  - Pitch of destination device pointer, in bytes (unused if \p Height is 1)
+ * \param ui        - 32-bit value to set
+ * \param Width     - Width of row, in 32-bit elements
+ * \param Height    - Number of rows to set
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Creates a 1D or 2D CUDA array
+ *
+ * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
+ * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        CUarray_format Format;
+        unsigned int NumChannels;
+    } CUDA_ARRAY_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width and \p Height are the width and height of the CUDA array (in
+ * elements); the CUDA array is one-dimensional if height is 0, two-dimensional
+ * otherwise;
+ * - \p Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0,
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
+    } CUarray_format;
+ * \endcode
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 1;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
+ * float16's:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 16-bit elements, each
+ * of which is two 8-bit unsigned chars:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    desc.NumChannels = 2;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * \param pHandle        - Returned array
+ * \param pAllocateArray - Array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocArray
+ */
+CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 1D or 2D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * \param[out] pArrayDescriptor - Returned array descriptor
+ * \param[in]  hArray           - Array to get descriptor of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaArrayGetInfo
+ */
+CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained
+ * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties
+ * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] array - CUDA array to get the sparse properties of
+ * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA mipmapped array
+ *
+ * Returns the sparse array layout properties in \p sparseProperties.
+ * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the
+ * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ * is less than that of the tile.
+ * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined.
+ * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer.
+ * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of
+ * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap);
+
+/**
+ * \brief Returns the memory requirements of a CUDA array
+ *
+ * Returns the memory requirements of a CUDA array in \p memoryRequirements.
+ * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size
+ * represents the total size of the CUDA array.
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment
+ * represents the alignment necessary for mapping the CUDA array.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ * \param[in] array - CUDA array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device);
+ 
+/**
+ * \brief Returns the memory requirements of a CUDA mipmapped array
+ *
+ * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements.
+ * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size
+ * represents the total size of the CUDA mipmapped array.
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment
+ * represents the alignment necessary for mapping the CUDA mipmapped
+ * array.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
+
+/**
+ * \brief Gets a CUDA array plane from a CUDA array
+ *
+ * Returns in \p *pPlaneArray a CUDA array that represents a single format plane
+ * of the CUDA array \p hArray.
+ *
+ * If \p planeIdx is greater than the maximum number of planes in this array or if the array does
+ * not have a multi-planar format (e.g. ::CU_AD_FORMAT_NV12), then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns
+ * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ *
+ * \param[out] pPlaneArray - Returned CUDA array referenced by the \p planeIdx
+ * \param[in]  hArray      - Multiplanar CUDA array
+ * \param[in]  planeIdx    - Plane index
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayCreate,
+ * ::cudaArrayGetPlane
+ */
+CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
+
+/**
+ * \brief Destroys a CUDA array
+ *
+ * Destroys the CUDA array \p hArray.
+ *
+ * \param[in] hArray - Array to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFreeArray
+ */
+CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
+
+/**
+ * \brief Creates a 3D CUDA array
+ *
+ * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        unsigned int Depth;
+        CUarray_format Format;
+        unsigned int NumChannels;
+        unsigned int Flags;
+    } CUDA_ARRAY3D_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ * CUDA array (in elements); the following types of CUDA arrays can be allocated:
+ *     - A 1D array is allocated if \p Height and \p Depth extents are both zero.
+ *     - A 2D array is allocated if only \p Depth extent is zero.
+ *     - A 3D array is allocated if all three extents are non-zero.
+ *     - A 1D layered CUDA array is allocated if only \p Height is zero and the
+ *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A 2D layered CUDA array is allocated if all three extents are non-zero and
+ *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A cubemap CUDA array is allocated if all three extents are non-zero and the
+ *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+ *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+ *       where the six layers represent the six faces of a cube. The order of the six
+ *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+ *     - A cubemap layered CUDA array is allocated if all three extents are non-zero,
+ *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+ *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+ *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+ *       consists of a collection of cubemaps. The first six layers represent the first
+ *       cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - \p Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0,
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
+    } CUarray_format;
+ * \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - \p Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
+ *     If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
+ *     to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
+ * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
+ * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
+ * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ * <tr><td>2D</td>
+ * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
+ * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ * <tr><td>3D</td>
+ * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</small></td></tr>
+ * <tr><td>1D Layered</td>
+ * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>2D Layered</td>
+ * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>Cubemap</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ * </table>
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 0;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
+ * 4x16-bit float16's:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+    desc.Depth = depth;
+ * \endcode
+ *
+ * \param[out] pHandle        - Returned array
+ * \param[in]  pAllocateArray - 3D array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMalloc3DArray
+ */
+CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 3D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * This function may be called on 1D and 2D arrays, in which case the \p Height
+ * and/or \p Depth members of the descriptor struct will be set to 0.
+ *
+ * \param[out] pArrayDescriptor - Returned 3D array descriptor
+ * \param[in]  hArray           - 3D array to get descriptor of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaArrayGetInfo
+ */
+CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+
+/**
+ * \brief Creates a CUDA mipmapped array
+ *
+ * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle.
+ * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is
+ * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+ *
+ * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        unsigned int Depth;
+        CUarray_format Format;
+        unsigned int NumChannels;
+        unsigned int Flags;
+    } CUDA_ARRAY3D_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ * CUDA array (in elements); the following types of CUDA arrays can be allocated:
+ *     - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero.
+ *     - A 2D mipmapped array is allocated if only \p Depth extent is zero.
+ *     - A 3D mipmapped array is allocated if all three extents are non-zero.
+ *     - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the
+ *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and
+ *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
+ *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+ *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+ *       where the six layers represent the six faces of a cube. The order of the six
+ *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+ *     - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero,
+ *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+ *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+ *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+ *       consists of a collection of cubemaps. The first six layers represent the first
+ *       cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - \p Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20,
+        CU_AD_FORMAT_NV12 = 0xb0,
+        CU_AD_FORMAT_UNORM_INT8X1 = 0xc0,
+        CU_AD_FORMAT_UNORM_INT8X2 = 0xc1,
+        CU_AD_FORMAT_UNORM_INT8X4 = 0xc2,
+        CU_AD_FORMAT_UNORM_INT16X1 = 0xc3,
+        CU_AD_FORMAT_UNORM_INT16X2 = 0xc4,
+        CU_AD_FORMAT_UNORM_INT16X4 = 0xc5,
+        CU_AD_FORMAT_SNORM_INT8X1 = 0xc6,
+        CU_AD_FORMAT_SNORM_INT8X2 = 0xc7,
+        CU_AD_FORMAT_SNORM_INT8X4 = 0xc8,
+        CU_AD_FORMAT_SNORM_INT16X1 = 0xc9,
+        CU_AD_FORMAT_SNORM_INT16X2 = 0xca,
+        CU_AD_FORMAT_SNORM_INT16X4 = 0xcb,
+        CU_AD_FORMAT_BC1_UNORM = 0x91,
+        CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92,
+        CU_AD_FORMAT_BC2_UNORM = 0x93,
+        CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94,
+        CU_AD_FORMAT_BC3_UNORM = 0x95,
+        CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96,
+        CU_AD_FORMAT_BC4_UNORM = 0x97,
+        CU_AD_FORMAT_BC4_SNORM = 0x98,
+        CU_AD_FORMAT_BC5_UNORM = 0x99,
+        CU_AD_FORMAT_BC5_SNORM = 0x9a,
+        CU_AD_FORMAT_BC6H_UF16 = 0x9b,
+        CU_AD_FORMAT_BC6H_SF16 = 0x9c,
+        CU_AD_FORMAT_BC7_UNORM = 0x9d,
+        CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e,
+        CU_AD_FORMAT_P010 = 0x9f,
+        CU_AD_FORMAT_P016 = 0xa1,
+        CU_AD_FORMAT_NV16 = 0xa2,
+        CU_AD_FORMAT_P210 = 0xa3,
+        CU_AD_FORMAT_P216 = 0xa4,
+        CU_AD_FORMAT_YUY2 = 0xa5,
+        CU_AD_FORMAT_Y210 = 0xa6,
+        CU_AD_FORMAT_Y216 = 0xa7,
+        CU_AD_FORMAT_AYUV = 0xa8,
+        CU_AD_FORMAT_Y410 = 0xa9,
+        CU_AD_FORMAT_Y416 = 0xb1,
+        CU_AD_FORMAT_Y444_PLANAR8 = 0xb2,
+        CU_AD_FORMAT_Y444_PLANAR10 = 0xb3,
+        CU_AD_FORMAT_YUV444_8bit_SemiPlanar = 0xb4,
+        CU_AD_FORMAT_YUV444_16bit_SemiPlanar = 0xb5,
+        CU_AD_FORMAT_UNORM_INT_101010_2 = 0x50,
+    } CUarray_format;
+ * \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - \p Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of
+ *     the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to
+ *     bind a mipmap level of the CUDA mipmapped array to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA mipmapped arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td>
+ * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ * <tr><td>2D</td>
+ * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td>
+ * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ * <tr><td>3D</td>
+ * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</small></td></tr>
+ * <tr><td>1D Layered</td>
+ * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>2D Layered</td>
+ * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>Cubemap</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ * </table>
+ *
+ *
+ * \param[out] pHandle             - Returned mipmapped array
+ * \param[in]  pMipmappedArrayDesc - mipmapped array descriptor
+ * \param[in]  numMipmapLevels     - Number of mipmap levels
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayDestroy,
+ * ::cuMipmappedArrayGetLevel,
+ * ::cuArrayCreate,
+ * ::cudaMallocMipmappedArray
+ */
+CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
+
+/**
+ * \brief Gets a mipmap level of a CUDA mipmapped array
+ *
+ * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level
+ * of the CUDA mipmapped array \p hMipmappedArray.
+ *
+ * If \p level is greater than the maximum number of levels in this mipmapped array,
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param[out] pLevelArray     - Returned mipmap level CUDA array
+ * \param[in]  hMipmappedArray - CUDA mipmapped array
+ * \param[in]  level           - Mipmap level
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayCreate,
+ * ::cuMipmappedArrayDestroy,
+ * ::cuArrayCreate,
+ * ::cudaGetMipmappedArrayLevel
+ */
+CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+
+/**
+ * \brief Destroys a CUDA mipmapped array
+ *
+ * Destroys the CUDA mipmapped array \p hMipmappedArray.
+ *
+ * \param hMipmappedArray - Mipmapped array to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayCreate,
+ * ::cuMipmappedArrayGetLevel,
+ * ::cuArrayCreate,
+ * ::cudaFreeMipmappedArray
+ */
+CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
+
+/** 
+* \brief Retrieve handle for an address range 
+* 
+* Get a handle of the specified type to an address range. The address range
+* must have been obtained by a prior call to either ::cuMemAlloc or ::cuMemAddressReserve.
+* If the address range was obtained via ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap.
+* The address range must have been obtained by a prior call to either ::cuMemAllocHost or
+* ::cuMemHostAlloc on Tegra.
+* 
+* Users must ensure the \p dptr and \p size are aligned to the host page size.
+* 
+* When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+* users are expected to query for dma_buf support for the platform
+* by using ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before calling
+* this API. The \p handle will be interpreted as a pointer to an integer to store the dma_buf file descriptor.
+* Users must ensure the entire address range is backed and mapped when
+* the address range is allocated by ::cuMemAddressReserve. All the physical
+* allocations backing the address range must be resident on the same device and
+* have identical allocation properties. Users are also expected to retrieve a
+* new handle every time the underlying physical allocation(s) corresponding
+* to a previously queried VA range are changed.
+*
+* For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users may set
+* \p flags to ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE. When this flag is
+* set on a supported platform, the call gives a DMA_BUF handle mapped via PCIE
+* BAR1; otherwise it returns an error.
+* 
+* \param[out] handle     - Pointer to the location where the returned handle will be stored. 
+* \param[in] dptr        - Pointer to a valid CUDA device allocation. Must be aligned to host page size.
+* \param[in] size        - Length of the address range. Must be aligned to host page size.
+* \param[in] handleType  - Type of handle requested (defines type and size of the \p handle output parameter)
+* \param[in] flags       - When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD the value could be
+*                          ::CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE, otherwise 0.
+* 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*/
+CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
+
+/**
+ * \brief Bitmasks for ::CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK.
+ */
+typedef enum CUmemDecompressAlgorithm_enum {
+    CU_MEM_DECOMPRESS_UNSUPPORTED       = 0,    /**< Decompression is unsupported. */
+    CU_MEM_DECOMPRESS_ALGORITHM_DEFLATE = 1<<0, /**< Deflate is supported. */
+    CU_MEM_DECOMPRESS_ALGORITHM_SNAPPY  = 1<<1  /**< Snappy is supported. */
+} CUmemDecompressAlgorithm;
+
+/**
+ * \brief Structure describing the parameters that compose a single
+ *        decompression operation.
+ */
+typedef struct CUmemDecompressParams_st {
+    /** The number of bytes to be read and decompressed from
+     *  ::CUmemDecompressParams_st.src. */
+    size_t srcNumBytes;
+    /** The number of bytes that the decompression operation will be expected to
+     *  write to ::CUmemDecompressParams_st.dst. This value is optional; if
+     *  present, it may be used by the CUDA driver as a heuristic for scheduling
+     *  the individual decompression operations. */
+    size_t dstNumBytes;
+    /** After the decompression operation has completed, the actual number of
+     *  bytes written to ::CUmemDecompressParams_st.dst will be recorded as a
+     *  32-bit unsigned integer in the memory at this address. */
+    cuuint32_t *dstActBytes;
+    /** Pointer to a buffer of at least ::CUmemDecompressParams_st.srcNumBytes
+     *  compressed bytes. */
+    const void *src;
+    /** Pointer to a buffer where the decompressed data will be written. The
+     *  number of bytes written to this location will be recorded in the memory
+     *  pointed to by ::CUmemDecompressParams_st.dstActBytes. */
+    void *dst;
+    /** The decompression algorithm to use. */
+    CUmemDecompressAlgorithm algo;
+    /*  These bytes are unused and must be zeroed. This ensures compatibility if
+     *  additional fields are added in the future. */
+    unsigned char padding[20];
+} CUmemDecompressParams;
+
+/**
+ * \brief   Submit a batch of \p count independent decompression operations.
+ *
+ * \details Each of the \p count decompression operations is described by a
+ *          single entry in the \p paramsArray array. Once the batch has been
+ *          submitted, the function will return, and decompression will happen
+ *          asynchronously w.r.t. the CPU. To the work completion tracking
+ *          mechanisms in the CUDA driver, the batch will be considered a single
+ *          unit of work and processed according to stream semantics, i.e., it
+ *          is not possible to query the completion of individual decompression
+ *          operations within a batch.
+ *
+ *          The memory pointed to by each of ::CUmemDecompressParams.src,
+ *          ::CUmemDecompressParams.dst, and ::CUmemDecompressParams.dstActBytes,
+ *          must be capable of usage with the hardware decompress feature. That
+ *          is, for each of said pointers, the pointer attribute
+ *          ::CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE should give a
+ *          non-zero value. To ensure this, the memory backing the pointers
+ *          should have been allocated using one of the following CUDA memory
+ *          allocators:
+ *          * ::cuMemAlloc()
+ *          * ::cuMemCreate() with the usage flag ::CU_MEM_CREATE_USAGE_HW_DECOMPRESS
+ *          * ::cuMemAllocFromPoolAsync() from a pool that was created with
+ *            the usage flag ::CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS
+ *          Additionally, ::CUmemDecompressParams.src, ::CUmemDecompressParams.dst,
+ *          and ::CUmemDecompressParams.dstActBytes, must all be accessible from
+ *          the device associated with the context where \p stream was created.
+ *          For information on how to ensure this, see the documentation for the
+ *          allocator of interest.
+ *
+ * \param[in]  paramsArray  The array of structures describing the independent
+ *                          decompression operations.
+ * \param[in]  count        The number of entries in \p paramsArray array.
+ * \param[in]  flags        Must be 0.
+ * \param[out] errorIndex   The index into \p paramsArray of the decompression
+ *                          operation to which the error returned by this
+ *                          function pertains. If \p *errorIndex is SIZE_MAX and
+ *                          the value returned is not ::CUDA_SUCCESS, then the
+ *                          error returned by this function should be considered
+ *                          a general error that does not pertain to a
+ *                          particular decompression operation. May be \p NULL,
+ *                          in which case, no index will be recorded in the
+ *                          event of error.
+ * \param[in]  stream       The stream where the work will be enqueued.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemAlloc, ::cuMemPoolCreate, ::cuMemAllocFromPoolAsync
+ */
+CUresult CUDAAPI cuMemBatchDecompressAsync(
+    CUmemDecompressParams *paramsArray,
+    size_t count,
+    unsigned int flags,
+    size_t *errorIndex,
+    CUstream stream
+);
+
+/** @} */ /* END CUDA_MEM */
+
+/**
+ * \defgroup CUDA_VA Virtual Memory Management
+ *
+ * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the virtual memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+* \brief Allocate an address range reservation. 
+* 
+* Reserves a virtual address range based on the given parameters, giving
+* the starting address of the range in \p ptr.  This API requires a system that
+* supports UVA.  The size and address parameters must be a multiple of the
+* host page size and the alignment must be a power of two or zero for default
+* alignment.
+*
+* \param[out] ptr       - Resulting pointer to start of virtual address range allocated
+* \param[in]  size      - Size of the reserved virtual address range requested
+* \param[in]  alignment - Alignment of the reserved virtual address range requested
+* \param[in]  addr      - Fixed starting address range requested
+* \param[in]  flags     - Currently unused, must be zero
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressFree
+*/
+CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
+
+/**
+* \brief Free an address range reservation.
+* 
+* Frees a virtual address range reserved by ::cuMemAddressReserve.  The \p size
+* must match what was given to ::cuMemAddressReserve and the \p ptr given must
+* match what was returned from ::cuMemAddressReserve.
+*
+* \param[in] ptr  - Starting address of the virtual address range to free
+* \param[in] size - Size of the virtual address region to free
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
+*
+* This creates a memory allocation on the target device specified through the
+* \p prop structure. The created allocation will not have any device or host
+* mappings. The generic memory \p handle for the allocation can be
+* mapped to the address space of calling process via ::cuMemMap. This handle
+* cannot be transmitted directly to other processes (see
+* ::cuMemExportToShareableHandle).  On Windows, the caller must also pass
+* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which
+* limits or allows access to this handle for a recipient process (see
+* ::CUmemAllocationProp::win32HandleMetaData for more).  The \p size of this
+* allocation must be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
+* flag.
+* To create a CPU allocation targeting a specific host NUMA node, applications must
+* set ::CUmemAllocationProp::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+* ::CUmemAllocationProp::CUmemLocation::id must specify the NUMA ID of the CPU.
+* On systems where NUMA is not available ::CUmemAllocationProp::CUmemLocation::id must be set to 0.
+* Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+* ::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
+*
+* Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
+* (1) `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices 
+* (2) have at least one IMEX channel file accessible by the user launching the application.
+*
+* When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+* share memory.
+*
+* The IMEX channel security model works on a per-user basis, which means all processes under a user can share
+* memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+* channel is required for each user.
+*
+* These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+* native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
+* users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+*
+* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
+* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
+* and sparse CUDA mipmapped arrays.
+* (see ::cuMemMapArrayAsync).
+*
+* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle.
+* \param[in]  size   - Size of the allocation requested
+* \param[in]  prop   - Properties of the allocation to create.
+* \param[in]  flags  - flags for future use, must be zero now.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
+
+/**
+* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
+* 
+* Frees the memory that was allocated on a device through cuMemCreate.
+*
+* The memory allocation will be freed when all outstanding mappings to the memory
+* are unmapped and when all outstanding references to the handle (including its
+* shareable counterparts) are also released. The generic memory handle can be
+* freed when there are still outstanding mappings made with this handle. Each
+* time a recipient process imports a shareable handle, it needs to pair it with
+* ::cuMemRelease for the handle to be freed.  If \p handle is not a valid handle
+* the behavior is undefined. 
+*
+* \param[in] handle Value of handle which was returned previously by cuMemCreate.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemCreate
+*/
+CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Maps an allocation handle to a reserved virtual address range.
+*
+* Maps bytes of memory represented by \p handle starting from byte \p offset to
+* \p size to address range [\p ptr, \p ptr + \p size]. This range must be an
+* address reservation previously reserved with ::cuMemAddressReserve, and
+* \p offset + \p size must be less than the size of the memory allocation.
+* \p ptr, \p size, and \p offset must each be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag.
+* If \p handle represents a multicast object, \p ptr, \p size and \p offset must
+* be aligned to the value returned by ::cuMulticastGetGranularity with the flag
+* ::CU_MULTICAST_MINIMUM_GRANULARITY. For best performance however, it is
+* recommended that \p ptr, \p size and \p offset be aligned to the value
+* returned by ::cuMulticastGetGranularity with the flag
+* ::CU_MULTICAST_RECOMMENDED_GRANULARITY.
+* 
+* Please note calling ::cuMemMap does not make the address accessible,
+* the caller needs to update accessibility of a contiguous mapped VA
+* range by calling ::cuMemSetAccess.
+* 
+* Once a recipient process obtains a shareable memory handle
+* from ::cuMemImportFromShareableHandle, the process must
+* use ::cuMemMap to map the memory into its address ranges before
+* setting accessibility with ::cuMemSetAccess.
+*  
+* ::cuMemMap can only create mappings on VA range reservations 
+* that are not currently mapped.
+* 
+* \param[in] ptr    - Address where memory will be mapped. 
+* \param[in] size   - Size of the memory mapping. 
+* \param[in] offset - Offset into the memory represented by 
+*                   - \p handle from which to start mapping
+*                   - Note: currently must be zero.
+* \param[in] handle - Handle to a shareable memory 
+* \param[in] flags  - flags for future use, must be zero now. 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
+
+/**
+ * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays
+ *
+ * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count.
+ * The structure ::CUarrayMapInfo is defined as follows:
+ \code
+     typedef struct CUarrayMapInfo_st {
+        CUresourcetype resourceType;                   
+        union {
+            CUmipmappedArray mipmap;
+            CUarray array;
+        } resource;
+
+        CUarraySparseSubresourceType subresourceType;   
+        union {
+            struct {
+                unsigned int level;                     
+                unsigned int layer;                     
+                unsigned int offsetX;                   
+                unsigned int offsetY;                   
+                unsigned int offsetZ;                   
+                unsigned int extentWidth;               
+                unsigned int extentHeight;              
+                unsigned int extentDepth;               
+            } sparseLevel;
+            struct {
+                unsigned int layer;
+                unsigned long long offset;              
+                unsigned long long size;                
+            } miptail;
+        } subresource;
+
+        CUmemOperationType memOperationType;
+        
+        CUmemHandleType memHandleType;                  
+        union {
+            CUmemGenericAllocationHandle memHandle;
+        } memHandle;
+
+        unsigned long long offset;                      
+        unsigned int deviceBitMask;                     
+        unsigned int flags;                             
+        unsigned int reserved[2];                       
+    } CUarrayMapInfo;
+ \endcode
+ *
+ * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then 
+ * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle.
+ * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using
+ * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ * or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+ * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY 
+ * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle.
+ * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
+ * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ * or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+ *
+ * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. 
+ * ::CUarraySparseSubresourceType_enum is defined as:
+ \code
+    typedef enum CUarraySparseSubresourceType_enum {
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+    } CUarraySparseSubresourceType;
+ \endcode
+ *
+ * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a
+ * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which
+ * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by 
+ * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type.
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
+ * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY
+ * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight
+ * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively.
+ * These offsets and extents must be aligned to the corresponding tile dimension.
+ * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise,
+ * must be zero.
+ * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise,
+ * must be zero.
+ * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth
+ * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays.
+ * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties.
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
+ * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in 
+ * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size.
+ * Both, mip tail offset and mip tail size must be aligned to the tile size. 
+ * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags
+ * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index.
+ * Otherwise, must be zero.
+ *
+ * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING
+ * flag set the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored.
+ *
+ * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as:
+ \code
+    typedef enum CUmemOperationType_enum {
+        CU_MEM_OPERATION_TYPE_MAP = 1,
+        CU_MEM_OPERATION_TYPE_UNMAP = 2
+    } CUmemOperationType;
+ \endcode
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource 
+ * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. 
+ * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, 
+ * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC.
+ * 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation
+ * is performed. ::CUarrayMapInfo::memHandle must be NULL.
+ *
+ * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. 
+ * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match 
+ * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle.
+ *
+ * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \param[in] mapInfoList - List of ::CUarrayMapInfo
+ * \param[in] count       - Count of ::CUarrayMapInfo  in \p mapInfoList
+ * \param[in] hStream     - Stream identifier for the stream to use for map or unmap operations
+ *
+ * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties
+ */
+CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo  *mapInfoList, unsigned int count, CUstream hStream);
+
+/**
+* \brief Unmap the backing memory of a given address range.
+*
+* The range must be the entire contiguous address range that was mapped to.  In
+* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped
+* by ::cuMemCreate / ::cuMemMap.  Any backing memory allocations will be freed
+* if there are no existing mappings and there are no unreleased memory handles.
+*
+* When ::cuMemUnmap returns successfully the address range is converted to an
+* address reservation and can be used for future calls to ::cuMemMap.  Any new
+* mapping to this virtual address will need to have access granted through
+* ::cuMemSetAccess, as all mappings start with no accessibility setup.
+*
+* \param[in] ptr  - Starting address for the virtual address range to unmap
+* \param[in] size - Size of the virtual address range to unmap
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemCreate, ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Set the access flags for each location specified in \p desc for the given virtual address range
+* 
+* Given the virtual address range via \p ptr and \p size, and the locations
+* in the array given by \p desc and \p count, set the access flags for the
+* target locations.  The range must be a fully mapped address range
+* containing all allocations created by ::cuMemMap / ::cuMemCreate.
+* Users cannot specify ::CU_MEM_LOCATION_TYPE_HOST_NUMA accessibility for allocations created with other location types.
+* Note: When ::CUmemAccessDesc::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST_NUMA, ::CUmemAccessDesc::CUmemLocation::id
+* is ignored.
+* When setting the access flags for a virtual address range mapping a multicast
+* object, \p ptr and \p size must be aligned to the value returned by
+* ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_MINIMUM_GRANULARITY.
+* For best performance however, it is recommended that \p ptr and \p size be
+* aligned to the value returned by ::cuMulticastGetGranularity with the flag
+* ::CU_MULTICAST_RECOMMENDED_GRANULARITY.
+*
+* \param[in] ptr   - Starting address for the virtual address range
+* \param[in] size  - Length of the virtual address range
+* \param[in] desc  - Array of ::CUmemAccessDesc that describe how to change the
+*                  - mapping for each location specified
+* \param[in] count - Number of ::CUmemAccessDesc in \p desc
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemSetAccess, ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);
+
+/**
+* \brief Get the access \p flags set for the given \p location and \p ptr
+*
+* \param[out] flags   - Flags set for this location
+* \param[in] location - Location in which to check the flags for
+* \param[in] ptr      - Address in which to check the access flags for
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemSetAccess
+*/
+CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr);
+
+/**
+* \brief Exports an allocation to a requested shareable handle type
+*
+* Given a CUDA memory handle, create a shareable memory
+* allocation handle that can be used to share the memory with other
+* processes. The recipient process can convert the shareable handle back into a
+* CUDA memory handle using ::cuMemImportFromShareableHandle and map
+* it with ::cuMemMap. The implementation of what this handle is and how it
+* can be transferred is defined by the requested handle type in \p handleType.
+*
+* Once all shareable handles are closed and the allocation is released, the allocated
+* memory referenced will be released back to the OS and uses of the CUDA handle afterward
+* will lead to undefined behavior.
+*
+* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL)
+* that support importing memory from the shareable type.
+*
+* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type
+* \param[in] handle           - CUDA handle for the memory allocation
+* \param[in] handleType       - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter)
+* \param[in] flags            - Reserved, must be zero
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+* \brief Imports an allocation from a requested shareable handle type.
+*
+* If the current process cannot support the memory described by this shareable
+* handle, this API will error as ::CUDA_ERROR_NOT_SUPPORTED.
+*
+* If \p shHandleType is ::CU_MEM_HANDLE_TYPE_FABRIC and the importer process has not been
+* granted access to the same IMEX channel as the exporter process, this API will error
+* as ::CUDA_ERROR_NOT_PERMITTED.
+*
+* \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.)
+* created on devices under an SLI group may not be supported, and thus this API will
+* return ::CUDA_ERROR_NOT_SUPPORTED.
+* There is no guarantee that the contents of \p handle will be the same CUDA memory handle
+* for the same given OS shareable handle, or the same underlying allocation.
+*
+* \param[out] handle       - CUDA Memory handle for the memory allocation.
+* \param[in]  osHandle     - Shareable Handle representing the memory allocation that is to be imported. 
+* \param[in]  shHandleType - handle type of the exported handle ::CUmemAllocationHandleType.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease
+*/
+CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+
+/**
+* \brief Calculates either the minimal or recommended granularity
+*
+* Calculates either the minimal or recommended granularity
+* for a given allocation specification and returns it in \p granularity.  This
+* granularity can be used as a multiple for alignment, size, or address mapping.
+*
+* \param[out] granularity Returned granularity.
+* \param[in]  prop Property for which to determine the granularity
+* \param[in]  option Determines which granularity to return
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
+
+/**
+* \brief Retrieve the contents of the property structure defining properties for this handle
+*
+* \param[out] prop  - Pointer to a properties structure which will hold the information about this handle
+* \param[in] handle - Handle on which to perform the query
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation.
+*
+* The handle is guaranteed to be the same handle value used to map the memory. If the address
+* requested is not mapped, the function will fail. The returned handle must be released with
+* a corresponding number of calls to ::cuMemRelease.
+*
+* \note The address \p addr can be any address in a range previously mapped
+* by ::cuMemMap, and not necessarily the start address.
+*
+* \param[out] handle CUDA Memory handle for the backing memory allocation.
+* \param[in] addr Memory address to query, that has been mapped previously.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr);
+
+/** @} */ /* END CUDA_VA */
+
+/**
+ * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator
+ *
+ * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
+ *                Functions for controlling the behavior of the underlying allocator.
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream ordered memory allocator exposed by the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_MALLOC_ASYNC_overview overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, a use before allocation / use after free error
+ * will cause undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. 
+ *
+ * \section CUDA_MALLOC_ASYNC_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cuDeviceGetAttribute() with the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
+ */
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ *       must therefore be passed the address of a graph allocation.
+ *
+ * \param[in] dptr    - Memory to free
+ * \param[in] hStream - The stream establishing the stream ordering contract.
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool current to the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
+
+/**
+ * \brief Tries to release memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than \p minBytesToKeep
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * \note Allocations that have not been freed count as outstanding.
+ * \note Allocations that have been asynchronously freed but whose completion has
+ *       not been observed on the host (e.g. by a synchronize) can count as outstanding.
+ *
+ * \param[in] pool           - The memory pool to trim
+ * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
+ * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ * at least minBytesToKeep bytes reserved after the operation.
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep);
+
+/**
+ * \brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of backing memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of used memory that was
+ *                    allocated for the memory pool.
+ *
+ * \param[in] pool  - The memory pool to modify
+ * \param[in] attr  - The attribute to modify
+ * \param[in] value - Pointer to the value to assign
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+
+/**
+ * \brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of backing memory currently allocated for the mempool
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of backing memory allocated for the mempool since the
+ *                    last time it was reset.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of memory from the pool that is currently in use by the application.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of the amount of memory from the pool that was in use by the application.
+ *
+ * \param[in] pool   - The memory pool to get attributes of
+ * \param[in] attr   - The attribute to get
+ * \param[out] value - Retrieved value
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+
+/**
+ * \brief Controls visibility of pools between devices
+ *
+ * \param[in] pool  - The pool being modified
+ * \param[in] map   - Array of access descriptors. Each descriptor instructs the access to enable for a single GPU.
+ * \param[in] count - Number of descriptors in the map array.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count);
+
+/**
+ * \brief Returns the accessibility of a pool from a device
+ *
+ * Returns in \p flags the accessibility of the pool's memory from the specified \p location.
+ *
+ * \param[out] flags   - The accessibility of the pool from the specified location
+ * \param[in] memPool  - The pool being queried
+ * \param[in] location - The location accessing the pool
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location);
+
+/**
+ * \brief Creates a memory pool
+ *
+ * Creates a CUDA memory pool and returns the handle in \p pool.  The \p poolProps determines
+ * the properties of the pool such as the backing device and IPC capabilities.
+ *
+ * To create a memory pool targeting a specific host NUMA node, applications must
+ * set ::CUmemPoolProps::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ * ::CUmemPoolProps::CUmemLocation::id must specify the NUMA ID of the host memory node.
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT or ::CU_MEM_LOCATION_TYPE_HOST as the
+ * ::CUmemPoolProps::CUmemLocation::type will result in ::CUDA_ERROR_INVALID_VALUE.
+ * By default, the pool's memory will be accessible from the device it is allocated on.
+ * In the case of pools created with ::CU_MEM_LOCATION_TYPE_HOST_NUMA, their default accessibility
+ * will be from the host CPU.
+ * Applications can control the maximum size of the pool by specifying a non-zero value for ::CUmemPoolProps::maxSize.
+ * If set to 0, the maximum size of the pool will default to a system dependent value.
+ *
+ * Applications that intend to use ::CU_MEM_HANDLE_TYPE_FABRIC based memory sharing must ensure:
+ * (1) `nvidia-caps-imex-channels` character device is created by the driver and is listed under /proc/devices
+ * (2) have at least one IMEX channel file accessible by the user launching the application.
+ *
+ * When exporter and importer CUDA processes have been granted access to the same IMEX channel, they can securely
+ * share memory.
+ *
+ * The IMEX channel security model works on a per user basis. Which means all processes under a user can share
+ * memory if the user has access to a valid IMEX channel. When multi-user isolation is desired, a separate IMEX
+ * channel is required for each user.
+ *
+ * These channel files exist in /dev/nvidia-caps-imex-channels/channel* and can be created using standard OS
+ * native calls like mknod on Linux. For example: To create channel0 with the major number from /proc/devices
+ * users can execute the following command: `mknod /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+ *
+ * \note Specifying ::CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
+ *
+ * \param[out] pool     - Returned handle of the created memory pool
+ * \param[in] poolProps - Properties of the pool to create
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool,
+ *     ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle
+ */
+CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps);
+
+/**
+ * \brief Destroys the specified memory pool
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when ::cuMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations.
+ *
+ * Destroying the current mempool of a device sets the default mempool of
+ * that device as the current mempool for that device.
+ *
+ * \note A device's default memory pool cannot be destroyed.
+ *
+ * \param[in] pool - The memory pool to destroy
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool);
+
+/**
+ * \brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * \note
+ *    -  The specified memory pool may be from a device different than that of the specified \p hStream.
+ *
+ *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ *
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] pool     - The pool to allocate from
+ * \param[in] hStream  - The stream establishing the stream ordering semantic
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess,
+ *     ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+
+/**
+ * \brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle.
+ * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs.
+ * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ * handle type.
+ *
+ * \note To create an IPC capable mempool, create a mempool with a ::CUmemAllocationHandleType other than ::CU_MEM_HANDLE_TYPE_NONE.
+ *
+ * \param[out] handle_out  - Returned OS handle
+ * \param[in] pool         - Pool to export
+ * \param[in] handleType   - The type of handle to create
+ * \param[in] flags        - Must be 0
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer,
+ *     ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+ * \brief Imports a memory pool from a shared handle.
+ *
+ * Specific allocations can be imported from the imported pool with ::cuMemPoolImportPointer.
+ *
+ * If \p handleType is ::CU_MEM_HANDLE_TYPE_FABRIC and the importer process has not been
+ * granted access to the same IMEX channel as the exporter process, this API will error
+ * as ::CUDA_ERROR_NOT_PERMITTED.
+ *
+ * \note Imported memory pools do not support creating new allocations.
+ *       As such imported memory pools may not be used in ::cuDeviceSetMemPool
+ *       or ::cuMemAllocFromPoolAsync calls.
+ *
+ * \param[out] pool_out    - Returned memory pool
+ * \param[in] handle       - OS handle of the pool to open
+ * \param[in] handleType   - The type of handle being imported
+ * \param[in] flags        - Must be 0
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
+        CUmemoryPool *pool_out,
+        void *handle,
+        CUmemAllocationHandleType handleType,
+        unsigned long long flags);
+
+/**
+ * \brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the ::cuMemPoolImportPointer API.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * \param[out] shareData_out - Returned export data
+ * \param[in] ptr            - Pointer to memory being exported
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr);
+
+/**
+ * \brief Import a memory pool allocation from another process.
+ *
+ * Returns in \p ptr_out a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with ::cuMemFree
+ * or ::cuMemFreeAsync.  If ::cuMemFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * \note The ::cuMemFreeAsync API may be used in the exporting process before
+ *       the ::cuMemFreeAsync operation completes in its stream as long as the
+ *       ::cuMemFreeAsync in the exporting process specifies a stream with
+ *       a stream dependency on the importing process's ::cuMemFreeAsync.
+ *
+ * \param[out] ptr_out  - Pointer to imported memory
+ * \param[in] pool      - Pool from which to import
+ * \param[in] shareData - Data specifying the memory to import
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData);
+
+/** @} */ /* END CUDA_MALLOC_ASYNC */
+
+/**
+ * \defgroup CUDA_MULTICAST Multicast Object Management
+ *
+ * ___MANBRIEF___ Functions for creating multicast objects, adding devices to them and binding/unbinding memory
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the CUDA multicast object operations exposed by the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_MULTICAST_overview overview
+ *
+ * A multicast object created via ::cuMulticastCreate enables certain memory
+ * operations to be broadcast to a team of devices. Devices can be added to a
+ * multicast object via ::cuMulticastAddDevice. Memory can be bound on each
+ * participating device via either ::cuMulticastBindMem or ::cuMulticastBindAddr.
+ * Multicast objects can be mapped into a device's virtual address space using
+ * the virtual memory management APIs (see ::cuMemMap and ::cuMemSetAccess).
+ *
+ * \section CUDA_MULTICAST_support Supported Platforms
+ *
+ * Support for multicast on a specific device can be queried using the device
+ * attribute ::CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED
+ */
+
+/**
+ * \brief Create a generic allocation handle representing a multicast object described by the given properties.
+ *
+ * This creates a multicast object as described by \p prop. The number of
+ * participating devices is specified by ::CUmulticastObjectProp::numDevices.
+ * Devices can be added to the multicast object via ::cuMulticastAddDevice.
+ * All participating devices must be added to the multicast object before memory
+ * can be bound to it. Memory is bound to the multicast object via either
+ * ::cuMulticastBindMem or ::cuMulticastBindAddr, and can be unbound via
+ * ::cuMulticastUnbind. The total amount of memory that can be bound per device
+ * is specified by ::CUmulticastObjectProp::size. This size must be a multiple of
+ * the value returned by ::cuMulticastGetGranularity with the flag
+ * ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however, the size
+ * should be aligned to the value returned by ::cuMulticastGetGranularity with
+ * the flag ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+ *
+ * After all participating devices have been added, multicast objects can also
+ * be mapped to a device's virtual address space using the virtual memory
+ * management APIs (see ::cuMemMap and ::cuMemSetAccess). Multicast objects can
+ * also be shared with other processes by requesting a shareable handle via
+ * ::cuMemExportToShareableHandle. Note that the desired types of shareable
+ * handles must be specified in the bitmask ::CUmulticastObjectProp::handleTypes.
+ * Multicast objects can be released using the virtual memory management API
+ * ::cuMemRelease.
+ *
+ * \param[out] mcHandle     Value of handle returned.
+ * \param[in]  prop         Properties of the multicast object to create.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuMulticastAddDevice, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind
+ * \sa ::cuMemCreate, ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
+ */
+CUresult CUDAAPI cuMulticastCreate(CUmemGenericAllocationHandle *mcHandle, const CUmulticastObjectProp *prop);
+
+/**
+ * \brief Associate a device to a multicast object.
+ *
+ * Associates a device to a multicast object. The added device will be a part of
+ * the multicast team of size specified by ::CUmulticastObjectProp::numDevices
+ * during ::cuMulticastCreate.
+ * The association of the device to the multicast object is permanent during
+ * the lifetime of the multicast object.
+ * All devices must be added to the multicast team before any memory can be
+ * bound to any device in the team. Any calls to ::cuMulticastBindMem or
+ * ::cuMulticastBindAddr will block until all devices have been added.
+ * Similarly all devices must be added to the multicast team before a virtual
+ * address range can be mapped to the multicast object. A call to ::cuMemMap
+ * will block until all devices have been added.
+ *
+ * \param[in] mcHandle     Handle representing a multicast object.
+ * \param[in] dev          Device that will be associated to the multicast
+ *                         object.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr
+ */
+CUresult CUDAAPI cuMulticastAddDevice(CUmemGenericAllocationHandle mcHandle, CUdevice dev);
+
+/**
+ * \brief Bind a memory allocation represented by a handle to a multicast object.
+ *
+ * Binds a memory allocation specified by \p memHandle and created via
+ * ::cuMemCreate to a multicast object represented by \p mcHandle and created
+ * via ::cuMulticastCreate. The intended \p size of the bind, the offset in the
+ * multicast range \p mcOffset as well as the offset in the memory \p memOffset
+ * must be a multiple of the value returned by ::cuMulticastGetGranularity with
+ * the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however,
+ * \p size, \p mcOffset and \p memOffset should be aligned to the granularity of
+ * the memory allocation (see ::cuMemGetAllocationGranularity) or to the value
+ * returned by ::cuMulticastGetGranularity with the flag
+ * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+ *
+ * The \p size + \p memOffset cannot be larger than the size of the allocated
+ * memory. Similarly the \p size + \p mcOffset cannot be larger than the size
+ * of the multicast object.
+ * The memory allocation must have been created on one of the devices
+ * that was added to the multicast team via ::cuMulticastAddDevice.
+ * Externally shareable as well as imported multicast objects can be bound only
+ * to externally shareable memory.
+ * Note that this call will return ::CUDA_ERROR_OUT_OF_MEMORY if there are
+ * insufficient resources required to perform the bind. This call may also
+ * return ::CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not
+ * initialized or running.
+ *
+ * \param[in]  mcHandle     Handle representing a multicast object.
+ * \param[in]  mcOffset     Offset into the multicast object for attachment.
+ * \param[in]  memHandle    Handle representing a memory allocation.
+ * \param[in]  memOffset    Offset into the memory for attachment.
+ * \param[in]  size         Size of the memory that will be bound to the
+ *                          multicast object.
+ * \param[in]  flags        Flags for future use, must be zero for now.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_SYSTEM_NOT_READY
+ *
+ * \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate
+ */
+CUresult CUDAAPI cuMulticastBindMem(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags);
+
+/**
+ * \brief Bind a memory allocation represented by a virtual address to a multicast object.
+ *
+ * Binds a memory allocation specified by its mapped address \p memptr to a
+ * multicast object represented by \p mcHandle.
+ * The memory must have been allocated via ::cuMemCreate or ::cudaMallocAsync.
+ * The intended \p size of the bind, the offset in the multicast range
+ * \p mcOffset and \p memptr must be a multiple of the value returned by
+ * ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
+ * For best performance however, \p size, \p mcOffset and \p memptr should be
+ * aligned to the value returned by ::cuMulticastGetGranularity with the flag
+ * ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+ *
+ * The \p size cannot be larger than the size of the allocated memory.
+ * Similarly the \p size + \p mcOffset cannot be larger than the total size
+ * of the multicast object.
+ * The memory allocation must have been created on one of the devices
+ * that was added to the multicast team via ::cuMulticastAddDevice.
+ * Externally shareable as well as imported multicast objects can be bound only
+ * to externally shareable memory.
+ * Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are
+ * insufficient resources required to perform the bind. This call may also
+ * return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not
+ * initialized or running.
+ *
+ * \param[in]  mcHandle     Handle representing a multicast object.
+ * \param[in]  mcOffset     Offset into multicast va range for attachment.
+ * \param[in]  memptr       Virtual address of the memory allocation.
+ * \param[in]  size         Size of memory that will be bound to the
+ *                          multicast object.
+ * \param[in]  flags        Flags for future use, must be zero now.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_SYSTEM_NOT_READY
+ *
+ * \sa ::cuMulticastCreate, ::cuMulticastAddDevice, ::cuMemCreate
+ */
+CUresult CUDAAPI cuMulticastBindAddr(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags);
+
+/**
+ * \brief Unbind any memory allocations bound to a multicast object at a given offset and up to a given size.
+ *
+ * Unbinds any memory allocations hosted on \p dev and bound to a multicast
+ * object at \p mcOffset and up to a given \p size.
+ * The intended \p size of the unbind and the offset in the multicast range
+ * ( \p mcOffset ) must be a multiple of the value returned by
+ * ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM.
+ * The \p size + \p mcOffset cannot be larger than the total size of the
+ * multicast object.
+ *
+ * \note 
+ * Warning:
+ * The \p mcOffset and the \p size must match the corresponding values specified
+ * during the bind call. Any other values may result in undefined behavior.
+ *
+ * \param[in]  mcHandle     Handle representing a multicast object.
+ * \param[in]  dev          Device that hosts the memory allocation.
+ * \param[in]  mcOffset     Offset into the multicast object.
+ * \param[in]  size         Desired size to unbind.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuMulticastBindMem, ::cuMulticastBindAddr
+ */
+CUresult CUDAAPI cuMulticastUnbind(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size);
+
+/**
+* \brief Calculates either the minimal or recommended granularity for multicast object
+*
+* Calculates either the minimal or recommended granularity for a given set of
+* multicast object properties and returns it in \p granularity.  This granularity
+* can be used as a multiple for size, bind offsets and address mappings of the
+* multicast object.
+*
+* \param[out] granularity Returned granularity.
+* \param[in]  prop        Properties of the multicast object.
+* \param[in]  option      Determines which granularity to return.
+*
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMulticastCreate, ::cuMulticastBindMem, ::cuMulticastBindAddr, ::cuMulticastUnbind
+*/
+CUresult CUDAAPI cuMulticastGetGranularity(size_t *granularity, const CUmulticastObjectProp *prop, CUmulticastGranularity_flags option);
+
+/** @} */ /* END CUDA_MULTICAST */
+
+/**
+ * \defgroup CUDA_UNIFIED Unified Addressing
+ *
+ * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the unified addressing functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_UNIFIED_overview Overview
+ *
+ * CUDA devices can share a unified address space with the host.
+ * For these devices there is no distinction between a device
+ * pointer and a host pointer -- the same pointer value may be
+ * used to access memory from the host program and from a kernel
+ * running on the device (with exceptions enumerated below).
+ *
+ * \section CUDA_UNIFIED_support Supported Platforms
+ *
+ * Whether or not a device supports unified addressing may be
+ * queried by calling ::cuDeviceGetAttribute() with the device
+ * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
+ *
+ * Unified addressing is automatically enabled in 64-bit processes.
+ *
+ * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values
+ *
+ * It is possible to look up information about the memory which backs a
+ * pointer value.  For instance, one may want to know if a pointer points
+ * to host or device memory.  As another example, in the case of device
+ * memory, one may want to know on which CUDA device the memory
+ * resides.  These properties may be queried using the function
+ * ::cuPointerGetAttribute().
+ *
+ * Since pointers are unique, it is not necessary to specify information
+ * about the pointers specified to the various copy functions in the
+ * CUDA API.  The function ::cuMemcpy() may be used to perform a copy
+ * between two pointers, ignoring whether they point to host or device
+ * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH()
+ * unnecessary for devices supporting unified addressing).  For
+ * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be
+ * used to specify that the CUDA driver should infer the location of the
+ * pointer from its value.
+ *
+ * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated in all contexts using ::cuMemAllocHost() and
+ * ::cuMemHostAlloc() is always directly accessible from all contexts on
+ * all devices that support unified addressing.  This is the case regardless
+ * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and
+ * ::CU_MEMHOSTALLOC_DEVICEMAP are specified.
+ *
+ * The pointer value through which allocated host memory may be accessed
+ * in kernels on all devices that support unified addressing is the same
+ * as the pointer value through which that memory is accessed on the host,
+ * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device
+ * pointer for these allocations.
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
+ *
+ * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory
+ *
+ * Upon enabling direct access from a context that supports unified addressing
+ * to another peer context that supports unified addressing using
+ * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using
+ * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible
+ * by the current context.  The device pointer value through
+ * which any peer memory may be accessed in the current context
+ * is the same pointer value through which that memory may be
+ * accessed in the peer context.
+ *
+ * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ *
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which they are accessed on the host.  These exceptions
+ * are host memory registered using ::cuMemHostRegister() and host memory
+ * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED.  For these
+ * exceptions, there exists a distinct host and device address for the
+ * memory.  The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all
+ * contexts that support unified addressing.
+ *
+ * This device address may be queried using ::cuMemHostGetDevicePointer()
+ * when a context using unified addressing is current.  Either the host
+ * or the unified device pointer value may be used to refer to this memory
+ * through ::cuMemcpy() and similar functions using the
+ * ::CU_MEMORYTYPE_UNIFIED memory type.
+ *
+ */
+
+/**
+ * \brief Returns information about a pointer
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
+ *
+ *      Returns in \p *data the ::CUcontext in which \p ptr was allocated or
+ *      registered.
+ *      The type of \p data must be ::CUcontext *.
+ *
+ *      If \p ptr was not allocated by, mapped by, or registered with
+ *      a ::CUcontext which uses unified virtual addressing then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
+ *
+ *      Returns in \p *data the physical memory type of the memory that
+ *      \p ptr addresses as a ::CUmemorytype enumerated value.
+ *      The type of \p data must be unsigned int.
+ *
+ *      If \p ptr addresses device memory then \p *data is set to
+ *      ::CU_MEMORYTYPE_DEVICE.  The particular ::CUdevice on which the
+ *      memory resides is the ::CUdevice of the ::CUcontext returned by the
+ *      ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
+ *
+ *      If \p ptr addresses host memory then \p *data is set to
+ *      ::CU_MEMORYTYPE_HOST.
+ *
+ *      If \p ptr was not allocated by, mapped by, or registered with
+ *      a ::CUcontext which uses unified virtual addressing then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      If the current ::CUcontext does not support unified virtual
+ *      addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
+ *
+ *      Returns in \p *data the device pointer value through which
+ *      \p ptr may be accessed by kernels running in the current
+ *      ::CUcontext.
+ *      The type of \p data must be CUdeviceptr *.
+ *
+ *      If there exists no device pointer value through which
+ *      kernels running in the current ::CUcontext may access
+ *      \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      If there is no current ::CUcontext then
+ *      ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ *      Except in the exceptional disjoint addressing cases discussed
+ *      below, the value returned in \p *data will equal the input
+ *      value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
+ *
+ *      Returns in \p *data the host pointer value through which
+ *      \p ptr may be accessed by the host program.
+ *      The type of \p data must be void **.
+ *      If there exists no host pointer value through which
+ *      the host program may directly access \p ptr then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      Except in the exceptional disjoint addressing cases discussed
+ *      below, the value returned in \p *data will equal the input
+ *      value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
+ *
+ *      Returns in \p *data two tokens for use with the nv-p2p.h Linux
+ *      kernel interface. \p data must be a struct of type
+ *      CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+ *
+ *      \p ptr must be a pointer to memory obtained from ::cuMemAlloc().
+ *      Note that p2pToken and vaSpaceToken are only valid for the
+ *      lifetime of the source allocation. A subsequent allocation at
+ *      the same address may return completely different tokens.
+ *      Querying this attribute has a side effect of setting the attribute
+ *      ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
+ *      \p ptr points to.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ *      A boolean attribute which when set, ensures that synchronous memory operations
+ *      initiated on the region of memory that \p ptr points to will always synchronize.
+ *      See further documentation in the section titled "API synchronization behavior"
+ *      to learn more about cases when synchronous memory operations can
+ *      exhibit asynchronous behavior.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID:
+ *
+ *      Returns in \p *data a buffer ID which is guaranteed to be unique within the process.
+ *      \p data must point to an unsigned long long.
+ *
+ *      \p ptr must be a pointer to memory obtained from a CUDA memory allocation API.
+ *      Every memory allocation from any of the CUDA memory allocation APIs will
+ *      have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs
+ *      from previous freed allocations. IDs are only unique within a single process.
+ *
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED:
+ *
+ *      Returns in \p *data a boolean that indicates whether the pointer points to
+ *      managed memory or not.
+ *
+ *      If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL:
+ *
+ *      Returns in \p *data an integer representing a device ordinal of a device against
+ *      which the memory was allocated or registered.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer maps to
+ *      an allocation that is suitable for ::cudaIpcGetMemHandle.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR:
+ *
+ *      Returns in \p *data the starting address for the allocation referenced
+ *      by the device pointer \p ptr.  Note that this is not necessarily the
+ *      address of the mapped region, but the address of the mappable address
+ *      range \p ptr references (e.g. from ::cuMemAddressReserve).
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE:
+ *
+ *      Returns in \p *data the size for the allocation referenced by the device
+ *      pointer \p ptr.  Note that this is not necessarily the size of the mapped
+ *      region, but the size of the mappable address range \p ptr references
+ *      (e.g. from ::cuMemAddressReserve).  To retrieve the size of the mapped
+ *      region, see ::cuMemGetAddressRange.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer is in a
+ *      valid address range that is mapped to a backing allocation.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES:
+ *
+ *      Returns a bitmask of the allowed handle types for an allocation that may
+ *      be passed to ::cuMemExportToShareableHandle.
+ * 
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE:
+ * 
+ *      Returns in \p *data the handle to the mempool that the allocation was obtained from.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE:
+ *
+ *      Returns in \p *data a boolean that indicates whether the pointer points
+ *      to memory that is capable of being used for hardware accelerated
+ *      decompression.
+ *
+ * \par
+ *
+ * Note that for most allocations in the unified virtual address space
+ * the host and device pointer for accessing the allocation will be the
+ * same.  The exceptions to this are
+ *  - user memory registered using ::cuMemHostRegister
+ *  - host memory allocated using ::cuMemHostAlloc with the
+ *    ::CU_MEMHOSTALLOC_WRITECOMBINED flag
+ * For these types of allocation there will exist separate, disjoint host
+ * and device addresses for accessing the allocation.  In particular
+ *  - The host address will correspond to an invalid unmapped device address
+ *    (which will result in an exception if accessed from the device)
+ *  - The device address will correspond to an invalid unmapped host address
+ *    (which will result in an exception if accessed from the host).
+ * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
+ * and device addresses from either address.
+ *
+ * \param data      - Returned pointer attribute value
+ * \param attribute - Pointer attribute to query
+ * \param ptr       - Pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuPointerSetAttribute,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister,
+ * ::cudaPointerGetAttributes
+ */
+CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
+
+/**
+ * \brief Prefetches memory to the specified destination device
+ * 
+ * Note there is a later version of this API, ::cuMemPrefetchAsync_v2. It will
+ * supplant this version in 13.0, which is retained for minor version compatibility.
+ *
+ * Prefetches memory to the specified destination device.  \p devPtr is the
+ * base device pointer of the memory to be prefetched and \p dstDevice is the
+ * destination device. \p count specifies the number of bytes to copy. \p hStream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables
+ * or it may also refer to system-allocated memory on systems with non-zero 
+ * CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
+ * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ * must be non-zero. Additionally, \p hStream must be associated with a device that has a
+ * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only setup on \p dstDevice. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ * below:
+ *
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on \p dstDevice.
+ *
+ * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param dstDevice - Destination device to prefetch to
+ * \param hStream    - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, ::cuMemPrefetchAsync_v2,
+ * ::cudaMemPrefetchAsync_v2
+ */
+CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+
+/**
+ * \brief Prefetches memory to the specified destination location
+ *
+ * Prefetches memory to the specified destination location.  \p devPtr is the
+ * base device pointer of the memory to be prefetched and \p location specifies the
+ * destination location. \p count specifies the number of bytes to copy. \p hStream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+ *
+ * Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU
+ * specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ * Applications can request prefetching memory to a specific host NUMA node by specifying
+ * ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ * Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ * ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note when ::CUmemLocation::type is either
+ * ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only setup on the destination location. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ * below:
+ *
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on the destination location.
+ * If however the destination location is a host NUMA node, then any pages of that subset
+ * that are already in another host NUMA node will be transferred to the destination.
+ *
+ * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ * range, then the pages will be migrated to \p location even if \p location is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param location  - Destination location to prefetch to
+ * \param flags     - flags for future use, must be zero now. 
+ * \param hStream   - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, ::cuMemPrefetchAsync,
+ * ::cudaMemPrefetchAsync_v2
+ */
+CUresult CUDAAPI cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream);
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Note there is a later version of this API, ::cuMemAdvise_v2. It will
+ * supplant this version in 13.0, which is retained for minor version compatibility.
+ * 
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ * is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. The \p device argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY:  Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ *
+ * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cuMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p device will not result in a read-only copy being created on that device as outlined in description for
+ * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ * and changes the preferred location to none.
+ *
+ * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then
+ * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p device, then the policies
+ * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * \param devPtr - Pointer to memory to set the advice for
+ * \param count  - Size in bytes of the memory range
+ * \param advice - Advice to be applied for the specified memory range
+ * \param device - Device to apply the advice for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, ::cuMemAdvise_v2,
+ * ::cudaMemAdvise
+ */
+CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device);
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ * or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ * another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ * the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ * Note: The \p location argument is ignored for this advice.
+ *
+ * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ * data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ * ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ * to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ * ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ * ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ * If ::CUmemLocation::type is a ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ * and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cuMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p location will not result in a read-only copy being created on that processor as outlined in description for
+ * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ * If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ * then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ * and changes the preferred location to none. The \p location argument is ignored for this advice.
+ *
+ * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ * The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ * ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ * If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p location, then the policies
+ * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ * then device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ * Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ * the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ * then device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ * Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * \param devPtr   - Pointer to memory to set the advice for
+ * \param count    - Size in bytes of the memory range
+ * \param advice   - Advice to be applied for the specified memory range
+ * \param location - Location to apply the advice for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, ::cuMemAdvise,
+ * ::cudaMemAdvise
+ */
+CUresult CUDAAPI cuMemAdvise_v2(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUmemLocation location);
+
+/**
+ * \brief Query an attribute of a given memory range
+ *
+ * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables.
+ *
+ * The \p attribute parameter can take the following values:
+ * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted
+ * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+ * memory range have read-duplication enabled, or 0 otherwise.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+ * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU
+ * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID
+ * if either all the pages don't have the same preferred location or some of the pages don't have a
+ * preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ * the query may be different from the preferred location.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted
+ * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range.
+ * If any device does not have that advice set for the entire memory range, that device will not be included.
+ * If \p data is larger than the number of devices that have that advice set for that memory range,
+ * CU_DEVICE_INVALID will be returned in all the extra space provided. For example, if \p dataSize is 12
+ * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have
+ * that advice set, then only as many devices will be returned as can fit in the array. There is no
+ * guarantee on which specific devices will be returned, however.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be
+ * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
+ * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
+ * last location that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE: If this attribute is specified, \p data will be
+ * interpreted as a ::CUmemLocationType, and \p dataSize must be sizeof(CUmemLocationType). The ::CUmemLocationType returned will be
+ * ::CU_MEM_LOCATION_TYPE_DEVICE if all pages in the memory range have the same GPU as their preferred location, or ::CUmemLocationType
+ * will be ::CU_MEM_LOCATION_TYPE_HOST if all pages in the memory range have the CPU as their preferred location, or it will be ::CU_MEM_LOCATION_TYPE_HOST_NUMA
+ * if all the pages in the memory range have the same host NUMA node ID as their preferred location or it will be ::CU_MEM_LOCATION_TYPE_INVALID
+ * if either all the pages don't have the same preferred location or some of the pages don't have a preferred location at all.
+ * Note that the actual location type of the pages in the memory range at the time of the query may be different from the preferred location type.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE query for the same address range
+ * returns ::CU_MEM_LOCATION_TYPE_DEVICE, it will be a valid device ordinal or if it returns ::CU_MEM_LOCATION_TYPE_HOST_NUMA, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE: If this attribute is specified, \p data will be
+ * interpreted as a ::CUmemLocationType, and \p dataSize must be sizeof(CUmemLocationType). The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. The ::CUmemLocationType returned
+ * will be ::CU_MEM_LOCATION_TYPE_DEVICE if the last prefetch location was a GPU or ::CU_MEM_LOCATION_TYPE_HOST if it was the CPU or ::CU_MEM_LOCATION_TYPE_HOST_NUMA if
+ * the last prefetch location was a specific host NUMA node. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, ::CUmemLocationType will be ::CU_MEM_LOCATION_TYPE_INVALID.
+ * Note that this simply returns the last location type that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. If the ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE query for the same address range
+ * returns ::CU_MEM_LOCATION_TYPE_DEVICE, it will be a valid device ordinal or if it returns ::CU_MEM_LOCATION_TYPE_HOST_NUMA, it will be a valid host NUMA node ID
+ * or if it returns any other location type, the id should be ignored.
+ *
+ * \param data      - A pointer to a memory location where the result
+ *                    of the attribute query will be written to.
+ * \param dataSize  - Size in bytes of the buffer pointed to by \p data
+ * \param attribute - The attribute to query
+ * \param devPtr    - Start of the range to query
+ * \param count     - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync,
+ * ::cuMemAdvise,
+ * ::cudaMemRangeGetAttribute
+ */
+CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes is given below. Please refer to ::cuMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID
+ *
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                        locations where the result of each attribute query will be written to.
+ * \param dataSizes     - Array containing the sizes of each result
+ * \param attributes    - An array of attributes to query
+ *                        (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr        - Start of the range to query
+ * \param count         - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise,
+ * ::cuMemPrefetchAsync,
+ * ::cudaMemRangeGetAttributes
+ */
+CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
+
+/**
+ * \brief Set attributes on a previously allocated memory region
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ *      A boolean attribute that can either be set (1) or unset (0). When set,
+ *      the region of memory that \p ptr points to is guaranteed to always synchronize
+ *      memory operations that are synchronous. If there are some previously initiated
+ *      synchronous memory operations that are pending when this attribute is set, the
+ *      function does not return until those memory operations are complete.
+ *      See further documentation in the section titled "API synchronization behavior"
+ *      to learn more about cases when synchronous memory operations can
+ *      exhibit asynchronous behavior.
+ *      \p value is interpreted as a pointer to an unsigned integer holding the value (1 or 0) to set this attribute to.
+ *
+ * \param value     - Pointer to memory containing the value to be set
+ * \param attribute - Pointer attribute to set
+ * \param ptr       - Pointer to a memory region allocated using CUDA memory allocation APIs
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuPointerGetAttribute,
+ * ::cuPointerGetAttributes,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister
+ */
+CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr);
+
+/**
+ * \brief Returns information about a pointer.
+ *
+ * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions):
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID
+ * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
+ * - ::CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE
+ *
+ * \param numAttributes - Number of attributes to query
+ * \param attributes    - An array of attributes to query
+ *                      (numAttributes and the number of attributes in this array should match)
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                      locations where the result of each attribute query will be written to.
+ * \param ptr           - Pointer to query
+ *
+ * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr
+ * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values
+ * and ::CUDA_SUCCESS is returned.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA
+ * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuPointerGetAttribute,
+ * ::cuPointerSetAttribute,
+ * ::cudaPointerGetAttributes
+ */
+CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr);
+
+/** @} */ /* END CUDA_UNIFIED */
+
+/**
+ * \defgroup CUDA_STREAM Stream Management
+ *
+ * ___MANBRIEF___ stream management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a stream
+ *
+ * Creates a stream and returns a handle in \p phStream.  The \p Flags argument
+ * determines behaviors of the stream.
+ *
+ * Valid values for \p Flags are:
+ * - ::CU_STREAM_DEFAULT: Default stream creation flag.
+ * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created
+ *   stream may run concurrently with work in stream 0 (the NULL stream), and that
+ *   the created stream should perform no implicit synchronization with stream 0.
+ *
+ * \param phStream - Returned newly created stream
+ * \param Flags    - Parameters for stream creation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
+
+/**
+ * \brief Create a stream with the given priority
+ *
+ * Creates a stream with the specified priority and returns a handle in \p phStream.
+ * This affects the scheduling priority of work in the stream. Priorities provide a
+ * hint to preferentially run work with higher priority when possible, but do
+ * not preempt already-running work or provide any other functional guarantee on
+ * execution order.
+ *
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param phStream    - Returned newly created stream
+ * \param flags       - Flags for stream creation. See ::cuStreamCreate for a list of
+ *                      valid flags
+ * \param priority    - Stream priority. Lower numbers represent higher priorities.
+ *                      See ::cuCtxGetStreamPriorityRange for more information about
+ *                      meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \note Stream priorities are supported only on GPUs
+ * with compute capability 3.5 or higher.
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreateWithPriority
+ */
+CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority);
+
+
+/**
+ * \brief Query the priority of a given stream
+ *
+ * Query the priority of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
+ * and return the priority in \p priority. Note that if the stream was created with a
+ * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * this function returns the clamped priority.
+ * See ::cuStreamCreateWithPriority for details about priority clamping.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param priority   - Pointer to a signed integer in which the stream's priority is returned
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cudaStreamGetPriority
+ */
+CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
+
+/**
+ * \brief Returns the device handle of the stream
+ *
+ * Returns in \p *device the device handle of the stream \p hStream.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param device  - Returns the device to which the stream belongs
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetFlags
+ */
+CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
+
+/**
+ * \brief Query the flags of a given stream
+ *
+ * Query the flags of a stream created using ::cuStreamCreate, ::cuStreamCreateWithPriority or ::cuGreenCtxStreamCreate
+ * and return the flags in \p flags.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param flags      - Pointer to an unsigned integer in which the stream's flags are returned
+ *                     The value returned in \p flags is a logical 'OR' of all flags that
+ *                     were used while creating this stream. See ::cuStreamCreate for the list
+ *                     of valid flags
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetDevice
+ */
+CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
+
+/**
+ * \brief Returns the unique Id associated with the stream handle supplied
+ *
+ * Returns in \p streamId the unique Id which is associated with the given stream handle.
+ * The Id is unique for the life of the program.
+ * 
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.</li>
+ * </ul>
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param streamId   - Pointer to store the Id of the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cudaStreamGetId
+ */
+CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
+
+/**
+ * \brief Query the context associated with a stream
+ *
+ * Returns the CUDA context that the stream is associated with.
+ *
+ * Note that there is a later version of this API, ::cuStreamGetCtx_v2, which will
+ * supplant this version in CUDA 13.0. Until then, prefer ::cuStreamGetCtx_v2, because
+ * this version returns ::CUDA_ERROR_NOT_SUPPORTED for streams created via ::cuGreenCtxStreamCreate.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   The returned context is the context that was active in the calling thread when the
+ *   stream was created. Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   Specifying any of the special handles will return the context current to the
+ *   calling thread. If no context is current to the calling thread,
+ *   ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
+ * </ul>
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param pctx    - Returned context associated with the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreateWithPriority,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cuStreamGetCtx_v2,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
+
+/**
+ * \brief Query the contexts associated with a stream
+ *
+ * Returns the contexts that the stream is associated with.
+ *
+ * If the stream is associated with a green context, the API returns the green context in \p pGreenCtx
+ * and the primary context of the associated device in \p pCtx.
+ *
+ * If the stream is associated with a regular context, the API returns the regular context in \p pCtx
+ * and NULL in \p pGreenCtx.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate,
+ *   ::cuStreamCreateWithPriority and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   If any of the special handles are specified, the API will operate on the context current to the
+ *   calling thread. If a green context (that was converted via ::cuCtxFromGreenCtx() before setting it current)
+ *   is current to the calling thread, the API will return the green context in \p pGreenCtx
+ *   and the primary context of the associated device in \p pCtx. If a regular context is current,
+ *   the API returns the regular context in \p pCtx and NULL in \p pGreenCtx.
+ *   Note that specifying ::CU_STREAM_PER_THREAD or ::cudaStreamPerThread will return ::CUDA_ERROR_INVALID_HANDLE
+ *   if a green context is current to the calling thread.
+ *   If no context is current to the calling thread, ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
+ * </ul>
+ *
+ * \param hStream   - Handle to the stream to be queried
+ * \param pCtx      - Returned regular context associated with the stream
+ * \param pGreenCtx - Returned green context if the stream is associated with a green context or NULL if not
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
+
+/**
+ * \brief Make a compute stream wait on an event
+ *
+ * Makes all future work submitted to \p hStream wait for all work captured in
+ * \p hEvent.  See ::cuEventRecord() for details on what is captured by an event.
+ * The synchronization will be performed efficiently on the device when applicable.
+ * \p hEvent may be from a different context or device than \p hStream.
+ *
+ * \p Flags include:
+ * - ::CU_EVENT_WAIT_DEFAULT: Default event wait flag.
+ * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
+ * \param hStream - Stream to wait
+ * \param hEvent  - Event to wait on (may not be NULL)
+ * \param Flags   - See ::CUevent_wait_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuEventRecord,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cuStreamDestroy,
+ * ::cudaStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+
+/**
+ * \brief Add a callback to a compute stream
+ *
+ * \note This function is slated for eventual deprecation and removal. If
+ * you do not require the callback to execute in case of a device error,
+ * consider using ::cuLaunchHostFunc. Additionally, this function is not
+ * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike
+ * ::cuLaunchHostFunc.
+ *
+ * Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed.  For each
+ * cuStreamAddCallback call, the callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * The callback may be passed ::CUDA_SUCCESS or an error code.  In the event
+ * of a device error, all subsequently executed callbacks will receive an
+ * appropriate ::CUresult.
+ *
+ * Callbacks must not make any CUDA API calls.  Attempting to use a CUDA API
+ * will result in ::CUDA_ERROR_NOT_PERMITTED.  Callbacks must not perform any
+ * synchronization that may depend on outstanding device work or other callbacks
+ * that are not mandated to run earlier.  Callbacks without a mandated order
+ * (in independent streams) execute in undefined order and may be serialized.
+ *
+ * For the purposes of Unified Memory, callback execution makes a number of
+ * guarantees:
+ * <ul>
+ *   <li>The callback stream is considered idle for the duration of the
+ *   callback.  Thus, for example, a callback may always use memory attached
+ *   to the callback stream.</li>
+ *   <li>The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback.  It thus synchronizes streams which have been "joined"
+ *   prior to the callback.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   callback with an event.</li>
+ *   <li>Completion of a callback does not cause a stream to become
+ *   active except as described above.  The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * \param hStream  - Stream to add callback to
+ * \param callback - The function to call once preceding stream operations are complete
+ * \param userData - User specified data to be passed to the callback function
+ * \param flags    - Reserved for future use, must be 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cuStreamAttachMemAsync,
+ * ::cuLaunchHostFunc,
+ * ::cudaStreamAddCallback
+ */
+CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+
+/**
+ * \brief Begins graph capture on a stream
+ *
+ * Begin graph capture on \p hStream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated
+ * if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the same stream in which
+ * it was initiated, and it may only be initiated if the stream is not already in capture
+ * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \param hStream - Stream in which to initiate capture
+ * \param mode    - Controls the interaction of this capture sequence with other API
+ *                  calls that are potentially unsafe. For more details see
+ *                  ::cuThreadExchangeStreamCaptureMode.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamEndCapture,
+ * ::cuThreadExchangeStreamCaptureMode
+ */
+CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode);
+
+/**
+ * \brief Begins graph capture on a stream to an existing graph
+ *
+ * Begin graph capture on \p hStream, placing new nodes into an existing graph. When a stream is 
+ * in capture mode, all operations pushed into the stream will not be executed, but will instead 
+ * be captured into \p hGraph. The graph will not be instantiable until the user calls 
+ * ::cuStreamEndCapture. 
+ *  
+ * Capture may not be initiated if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the
+ * same stream in which it was initiated, and it may only be initiated if the stream is not 
+ * already in capture mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \param hStream         - Stream in which to initiate capture.
+ * \param hGraph          - Graph to capture into.
+ * \param dependencies    - Dependencies of the first node captured in the stream.  Can be NULL if numDependencies is 0.
+ * \param dependencyData  - Optional array of data associated with each dependency.
+ * \param numDependencies - Number of dependencies.
+ * \param mode            - Controls the interaction of this capture sequence with other API
+ *                          calls that are potentially unsafe. For more details see
+ *                          ::cuThreadExchangeStreamCaptureMode.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamCreate,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamEndCapture,
+ * ::cuThreadExchangeStreamCaptureMode,
+ * ::cuGraphAddNode
+ */
+CUresult CUDAAPI cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+
+/**
+ * \brief Swaps the stream capture interaction mode for a thread
+ *
+ * Sets the calling thread's stream capture interaction mode to the value contained
+ * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ * facilitate deterministic behavior across function or module boundaries, callers
+ * are encouraged to use this API in a push-pop fashion: \code
+     CUstreamCaptureMode mode = desiredMode;
+     cuThreadExchangeStreamCaptureMode(&mode);
+     ...
+     cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ * \endcode
+ *
+ * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call
+ * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ * not enqueued asynchronously to a stream, and is not observed by stream capture.
+ * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture
+ * depended on the allocation being replayed whenever the graph is launched, the
+ * captured graph would be invalid.
+ *
+ * Therefore, stream capture places restrictions on API calls that can be made within
+ * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This
+ * behavior can be controlled via this API and flags to ::cuStreamBeginCapture.
+ *
+ * A thread's mode is one of the following:
+ * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has
+ *   an ongoing capture sequence that was not initiated with
+ *   \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread
+ *   has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL,
+ *   this thread is prohibited from potentially unsafe API calls.
+ * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture
+ *   sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited
+ *   from potentially unsafe API calls. Concurrent capture sequences in other threads
+ *   are ignored.
+ * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially
+ *   unsafe API calls. Note that the thread is still prohibited from API calls which
+ *   necessarily conflict with stream capture, for example, attempting ::cuEventQuery
+ *   on an event that was last recorded inside a capture sequence.
+ *
+ * \param mode - Pointer to mode value to swap with the current mode
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBeginCapture
+ */
+CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode);
+
+/**
+ * \brief Ends capture on a stream, returning the captured graph
+ *
+ * End capture on \p hStream, returning the captured graph via \p phGraph.
+ * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture.
+ * If capture was invalidated, due to a violation of the rules of stream capture, then
+ * a NULL graph will be returned.
+ *
+ * If the \p mode argument to ::cuStreamBeginCapture was not
+ * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as
+ * ::cuStreamBeginCapture.
+ *
+ * \param hStream - Stream on which to end capture
+ * \param phGraph - The captured graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuGraphDestroy
+ */
+CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
+
+/**
+ * \brief Returns a stream's capture status
+ *
+ * Return the capture status of \p hStream via \p captureStatus. After a successful
+ * call, \p *captureStatus will contain one of the following:
+ * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing.
+ * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing.
+ * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error
+ *   has invalidated the capture sequence. The capture sequence must be terminated
+ *   with ::cuStreamEndCapture on the stream where it was initiated in order to
+ *   continue using \p hStream.
+ *
+ * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while
+ * a blocking stream in the same context is capturing, it will return
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified
+ * after the call. The blocking stream capture is not invalidated.
+ *
+ * When a blocking stream is capturing, the legacy stream is in an
+ * unusable state until the blocking stream capture is terminated. The legacy
+ * stream is not supported for stream capture, but attempted use would have an
+ * implicit dependency on the capturing stream(s).
+ *
+ * \param hStream       - Stream to query
+ * \param captureStatus - Returns the stream's capture status
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamEndCapture
+ */
+CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+
+
+/**
+ * \brief Query a stream's capture state
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns ::CUDA_SUCCESS
+ * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * \param hStream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cuStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo_v3,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamUpdateCaptureDependencies
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
+        cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+
+/**
+ * \brief Query a stream's capture state (12.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns ::CUDA_SUCCESS
+ * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * If \p edgeData_out is non-NULL then \p dependencies_out must be as well. If
+ * \p dependencies_out is non-NULL and \p edgeData_out is NULL, but there is non-zero edge
+ * data for one or more of the current stream dependencies, the call will return
+ * ::CUDA_ERROR_LOSSY_QUERY.
+ *
+ * \param hStream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cuStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until the
+ *           capture is terminated. The node handles may be copied out and are valid until
+ *           they or the graph is destroyed. The driver-owned array may also be passed
+ *           directly to APIs that operate on the graph (not the stream) without copying.
+ * \param edgeData_out - Optional location to store a pointer to an array of graph edge
+ *           data. This array parallels \c dependencies_out; the next node to be added
+ *           has an edge to \c dependencies_out[i] with annotation \c edgeData_out[i] for
+ *           each \c i. The array pointer is valid until the next API call which operates
+ *           on the stream or until the capture is terminated.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
+ * ::CUDA_ERROR_LOSSY_QUERY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamUpdateCaptureDependencies
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
+        cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out,
+        const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out);
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (11.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ * ::cuStreamEndCapture.
+ *
+ * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+ *
+ * This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ * versions back to CUDA 11.0 should either avoid this API or provide a fallback.
+ *
+ * \param hStream - The stream to update
+ * \param dependencies - The set of dependencies to add
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamGetCaptureInfo
+ */
+CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (12.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on along with the
+ * edge data for those dependencies.
+ *
+ * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ * ::cuStreamEndCapture.
+ *
+ * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+ *
+ * \param hStream - The stream to update
+ * \param dependencies - The set of dependencies to add
+ * \param dependencyData - Optional array of data associated with each dependency.
+ * \param numDependencies - The size of the dependencies array
+ * \param flags - See above
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamGetCaptureInfo
+ */
+CUresult CUDAAPI cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode *dependencies,
+    const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p hStream to specify stream association of
+ * \p length bytes of memory starting from \p dptr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p dptr must point to one of the following types of memories:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cuMemAllocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable host allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::CUmemAttach_flags.
+ * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with
+ * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p hStream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p hStream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p hStream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cuMemAllocManaged. For __managed__ variables, the default
+ * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param hStream - Stream in which to enqueue the attach operation
+ * \param dptr    - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  pageable memory)
+ * \param length  - Length of memory
+ * \param flags   - Must be one of ::CUmemAttach_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cudaStreamAttachMemAsync
+ */
+CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
+
+/**
+ * \brief Determine status of a compute stream
+ *
+ * Returns ::CUDA_SUCCESS if all operations in the stream specified by
+ * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not.
+ *
+ * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ * is equivalent to having called ::cuStreamSynchronize().
+ *
+ * \param hStream - Stream to query status of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamQuery
+ */
+CUresult CUDAAPI cuStreamQuery(CUstream hStream);
+
+/**
+ * \brief Wait until a stream's tasks are completed
+ *
+ * Waits until the device has completed all operations in the stream specified
+ * by \p hStream. If the context was created with the
+ * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the
+ * stream is finished with all of its tasks.
+ *
+ * \param hStream - Stream to wait for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamDestroy,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamSynchronize
+ */
+CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
+
+/**
+ * \brief Destroys a stream
+ *
+ * Destroys the stream specified by \p hStream.
+ *
+ * In case the device is still doing work in the stream \p hStream
+ * when ::cuStreamDestroy() is called, the function will return immediately
+ * and the resources associated with \p hStream will be released automatically
+ * once the device has completed all work in \p hStream.
+ *
+ * \param hStream - Stream to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamDestroy
+ */
+CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
+
+/**
+ * \brief Copies attributes from source stream to destination stream.
+ *
+ * Copies attributes from source stream \p src to destination stream \p dst.
+ * Both streams must have the same context.
+ *
+ * \param[out] dst Destination stream
+ * \param[in] src Source stream
+ * For list of attributes see ::CUstreamAttrID
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src);
+
+/**
+ * \brief Queries stream attribute.
+ *
+ * Queries attribute \p attr from \p hStream and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hStream
+ * \param[in] attr
+ * \param[out] value_out
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      CUstreamAttrValue *value_out);
+
+/**
+ * \brief Sets stream attribute.
+ *
+ * Sets attribute \p attr on \p hStream from corresponding attribute of
+ * \p value. The updated attribute will be applied to subsequent work
+ * submitted to the stream. It will not affect previously submitted work.
+ *
+ * \param[out] hStream
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      const CUstreamAttrValue *value);
+
+/** @} */ /* END CUDA_STREAM */
+
+
+/**
+ * \defgroup CUDA_EVENT Event Management
+ *
+ * ___MANBRIEF___ event management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the event management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates an event
+ *
+ * Creates an event *phEvent for the current context with the flags specified via
+ * \p Flags. Valid flags include:
+ * - ::CU_EVENT_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+ *   synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+ *   an event created with this flag will block until the event has actually
+ *   been recorded.
+ * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best
+ *   performance when used with ::cuStreamWaitEvent() and ::cuEventQuery().
+ * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an
+ *   interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must
+ *   be specified along with ::CU_EVENT_DISABLE_TIMING.
+ *
+ * \param phEvent - Returns newly created event
+ * \param Flags   - Event creation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventCreate,
+ * ::cudaEventCreateWithFlags
+ */
+CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p hEvent the contents of \p hStream at the time of this call.
+ * \p hEvent and \p hStream must be from the same context otherwise
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p hStream after this call do not modify \p hEvent. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cuEventRecord() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cuStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an
+ * event represents an empty set of work, so for example ::cuEventQuery()
+ * would return ::CUDA_SUCCESS.
+ *
+ * \param hEvent  - Event to record
+ * \param hStream - Stream to record event for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventRecord,
+ * ::cuEventRecordWithFlags
+ */
+CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p hEvent the contents of \p hStream at the time of this call.
+ * \p hEvent and \p hStream must be from the same context otherwise
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p hStream after this call do not modify \p hEvent. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cuEventRecordWithFlags() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cuStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an
+ * event represents an empty set of work, so for example ::cuEventQuery()
+ * would return ::CUDA_SUCCESS.
+ *
+ * Valid values for \p flags include:
+ * - ::CU_EVENT_RECORD_DEFAULT: Default event record flag.
+ * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
+ * \param hEvent  - Event to record
+ * \param hStream - Stream to record event for
+ * \param flags   - See ::CUevent_record_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cuEventRecord,
+ * ::cudaEventRecord
+ */
+CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
+
+/**
+ * \brief Queries an event's status
+ *
+ * Queries the status of all work currently captured by \p hEvent. See
+ * ::cuEventRecord() for details on what is captured by an event.
+ *
+ * Returns ::CUDA_SUCCESS if all captured work has been completed, or
+ * ::CUDA_ERROR_NOT_READY if any captured work is incomplete.
+ *
+ * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ * is equivalent to having called ::cuEventSynchronize().
+ *
+ * \param hEvent - Event to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_READY
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventQuery
+ */
+CUresult CUDAAPI cuEventQuery(CUevent hEvent);
+
+/**
+ * \brief Waits for an event to complete
+ *
+ * Waits until the completion of all work currently captured in \p hEvent.
+ * See ::cuEventRecord() for details on what is captured by an event.
+ *
+ * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC
+ * flag will cause the calling CPU thread to block until the event has
+ * been completed by the device.  If the ::CU_EVENT_BLOCKING_SYNC flag has
+ * not been set, then the CPU thread will busy-wait until the event has
+ * been completed by the device.
+ *
+ * \param hEvent - Event to wait for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventSynchronize
+ */
+CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
+
+/**
+ * \brief Destroys an event
+ *
+ * Destroys the event specified by \p hEvent.
+ *
+ * An event may be destroyed before it is complete (i.e., while
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the
+ * call does not block on completion of the event, and any associated
+ * resources will automatically be released asynchronously at completion.
+ *
+ * \param hEvent - Event to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventElapsedTime,
+ * ::cudaEventDestroy
+ */
+CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
+
+/**
+ * \brief Computes the elapsed time between two events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds).
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cuEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cuEventRecord() has not been called on either event then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
+ * on both events but one or both of them has not yet been completed (that is,
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ * 
+ * Note there is a later version of this API, ::cuEventElapsedTime_v2. It will
+ * supplant this version in CUDA 13.0; this version is retained for minor version compatibility.
+ *
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
+ * \param hStart        - Starting event
+ * \param hEnd          - Ending event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cudaEventElapsedTime
+ */
+CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+
+/**
+ * \brief Computes the elapsed time between two events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds). Note this API is not guaranteed
+ * to return the latest errors for pending work. As such this API is intended to
+ * serve as an elapsed time calculation only and any polling for completion on the
+ * events to be compared should be done with ::cuEventQuery instead.
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cuEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cuEventRecord() has not been called on either event then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
+ * on both events but one or both of them has not yet been completed (that is,
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ *
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
+ * \param hStart        - Starting event
+ * \param hEnd          - Ending event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cudaEventElapsedTime
+ */
+CUresult CUDAAPI cuEventElapsedTime_v2(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+
+/** @} */ /* END CUDA_EVENT */
+
+/**
+ * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability
+ *
+ * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the external resource interoperability functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+ /**
+ * \brief Imports an external memory object
+ *
+ * Imports an externally allocated memory object and returns
+ * a handle to that in \p extMem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure
+ * is defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
+            CUexternalMemoryHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void *nvSciBufObject;
+            } handle;
+            unsigned long long size;
+            unsigned int flags;
+        } CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type
+ * of handle being imported. ::CUexternalMemoryHandleType is
+ * defined as:
+ *
+ * \code
+        typedef enum CUexternalMemoryHandleType_enum {
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8,
+        } CUexternalMemoryHandleType;
+ * \endcode
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a memory object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a memory object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a memory object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * be non-NULL and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * memory object are destroyed.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Heap object. This handle holds a reference to the underlying
+ * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Heap object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Resource object. This handle holds a reference to the
+ * underlying object. If
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Resource object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared NT handle that is returned by
+ * IDXGIResource1::CreateSharedHandle when referring to a
+ * ID3D11Resource object. If
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D11Resource object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared KMT handle that is returned by
+ * IDXGIResource::GetSharedHandle when referring to a
+ * ID3D11Resource object and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * must be NULL.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL
+ * and reference a valid NvSciBuf object.
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync
+ * as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+ * for memory synchronization.
+ *
+ *
+ * The size of the memory object must be specified in
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size.
+ *
+ * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the
+ * resource is a dedicated resource. The definition of what constitutes a
+ * dedicated resource is outside the scope of this extension.
+ * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type
+ * is one of the following:
+ * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+ * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+ * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+ *
+ * \param extMem_out    - Returned handle to an external memory object
+ * \param memHandleDesc - Memory import handle descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
+ * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
+ * as well as appropriate Vulkan pipeline barriers to maintain coherence between
+ * CPU and GPU. For more information on these APIs, please refer to the "Synchronization
+ * and Cache Control" chapter of the Vulkan specification.
+ *
+ * \sa ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc);
+
+/**
+ * \brief Maps a buffer onto an imported memory object
+ *
+ * Maps a buffer onto an imported memory object and returns a device
+ * pointer in \p devPtr.
+ *
+ * The properties of the buffer being mapped must be described in
+ * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is
+ * defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
+            unsigned long long offset;
+            unsigned long long size;
+            unsigned int flags;
+        } CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in
+ * the memory object where the buffer's base address is.
+ * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer.
+ * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero.
+ *
+ * The offset and size have to be suitably aligned to match the
+ * requirements of the external API. Mapping two buffers whose ranges
+ * overlap may or may not result in the same virtual address being
+ * returned for the overlapped portion. In such cases, the application
+ * must ensure that all accesses to that region from the GPU are
+ * volatile. Otherwise writes made via one address are not guaranteed
+ * to be visible via the other address, even if they're issued by the
+ * same thread. It is recommended that applications map the combined
+ * range instead of mapping separate buffers and then apply the
+ * appropriate offsets to the returned pointer to derive the
+ * individual buffers.
+ *
+ * The returned pointer \p devPtr must be freed using ::cuMemFree.
+ *
+ * \param devPtr     - Returned device pointer to buffer
+ * \param extMem     - Handle to external memory object
+ * \param bufferDesc - Buffer descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc);
+
+/**
+ * \brief Maps a CUDA mipmapped array onto an external memory object
+ *
+ * Maps a CUDA mipmapped array onto an external object and returns a
+ * handle to it in \p mipmap.
+ *
+ * The properties of the CUDA mipmapped array being mapped must be
+ * described in \p mipmapDesc. The structure
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
+            unsigned long long offset;
+            CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+            unsigned int numLevels;
+        } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the
+ * offset in the memory object where the base level of the mipmap
+ * chain is.
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes
+ * the format, dimensions and type of the base level of the mipmap
+ * chain. For further details on these parameters, please refer to the
+ * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped
+ * array is bound as a color target in the graphics API, then the flag
+ * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags.
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies
+ * the total number of levels in the mipmap chain.
+ *
+ * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
+ *
+ *
+ * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
+ *
+ * \param mipmap     - Returned CUDA mipmapped array
+ * \param extMem     - Handle to external memory object
+ * \param mipmapDesc - CUDA array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer
+ */
+CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc);
+
+/**
+ * \brief Destroys an external memory object.
+ *
+ * Destroys the specified external memory object. Any existing buffers
+ * and CUDA mipmapped arrays mapped onto this object must no longer be
+ * used and must be explicitly freed using ::cuMemFree and
+ * ::cuMipmappedArrayDestroy respectively.
+ *
+ * \param extMem - External memory object to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
+
+/**
+ * \brief Imports an external semaphore
+ *
+ * Imports an externally allocated synchronization object and returns
+ * a handle to that in \p extSem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is
+ * defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
+            CUexternalSemaphoreHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void* nvSciSyncObj;
+            } handle;
+            unsigned int flags;
+        } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of
+ * handle being imported. ::CUexternalSemaphoreHandleType is defined
+ * as:
+ *
+ * \code
+        typedef enum CUexternalSemaphoreHandleType_enum {
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD                = 1,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32             = 2,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT         = 3,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE              = 4,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE              = 5,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC                = 6,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX        = 7,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT    = 8,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD    = 9,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
+        } CUexternalSemaphoreHandleType;
+ * \endcode
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must
+ * be non-NULL and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * synchronization object are destroyed.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Fence object. This handle holds a reference to the underlying
+ * object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D12Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that is returned by
+ * ID3D11Fence::CreateSharedHandle. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that is returned by
+ * IDXGIResource1::CreateSharedHandle when referring to an IDXGIKeyedMutex
+ * object. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid IDXGIKeyedMutex object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared KMT handle that
+ * is returned by IDXGIResource::GetSharedHandle when referring to
+ * a IDXGIKeyedMutex object and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object.
+ *
+ * \param extSem_out    - Returned handle to an external semaphore
+ * \param semHandleDesc - Semaphore import handle descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc);
+
+/**
+ * \brief Signals a set of external semaphore objects
+ *
+ * Enqueues a signal operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of signaling a semaphore depends on the type of
+ * the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ * then signaling the semaphore will set it to the signaled state.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then the semaphore will be set to the value specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value.
+ *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence
+ * to a value that can be used by subsequent waiters of the same NvSciSync object
+ * to order operations with those currently submitted in \p stream. Such an update
+ * will overwrite previous contents of
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default,
+ * signaling such an external semaphore object causes appropriate memory synchronization
+ * operations to be performed over all external memory objects that are imported as
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses
+ * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ * These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ * NvSciSyncFence associated with semaphore object of the type 
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC can be deterministic. For this the 
+ * NvSciSyncAttrList used to create the semaphore object must have value of 
+ * NvSciSyncAttrKey_RequireDeterministicFences key set to true. Deterministic fences 
+ * allow users to enqueue a wait over the semaphore object even before corresponding
+ * signal is enqueued. For such a semaphore object, CUDA guarantees that each signal 
+ * operation will increment the fence value by '1'. Users are expected to track count 
+ * of signals enqueued on the semaphore object and insert waits accordingly. When such 
+ * a semaphore object is signaled from multiple streams, due to concurrent stream 
+ * execution, it is possible that the order in which the semaphore gets signaled is 
+ * indeterministic. This could lead to waiters of the semaphore getting unblocked 
+ * incorrectly. Users are expected to handle such situations, either by not using the 
+ * same semaphore object with deterministic fence support enabled in different streams 
+ * or by adding explicit dependency amongst such streams so that the semaphore is 
+ * signaled in order.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be released with the key specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::keyedmutex::key.
+ *
+ * \param extSemArray - Set of external semaphores to be signaled
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to signal
+ * \param stream      - Stream to enqueue the signal operations in
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+
+/**
+ * \brief Waits on a set of external semaphore objects
+ *
+ * Enqueues a wait operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of waiting on a semaphore depends on the type
+ * of the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ * then waiting on the semaphore will wait until the semaphore reaches
+ * the signaled state. The semaphore will then be reset to the
+ * unsignaled state. Therefore for every signal operation, there can
+ * only be one wait operation.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value.
+ *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * then, waiting on the semaphore will wait until the
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be acquired when it is released with the key 
+ * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key 
+ * or until the timeout specified by
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs
+ * has lapsed. The timeout interval can either be a finite value
+ * specified in milliseconds or an infinite value. In case an infinite
+ * value is specified the timeout never elapses. The Windows INFINITE
+ * macro must be used to specify infinite timeout.
+ *
+ * \param extSemArray - External semaphores to be waited on
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to wait on
+ * \param stream      - Stream to enqueue the wait operations in
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_TIMEOUT
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+
+/**
+ * \brief Destroys an external semaphore
+ *
+ * Destroys an external semaphore object and releases any references
+ * to the underlying resource. Any outstanding signals or waits must
+ * have completed before the semaphore is destroyed.
+ *
+ * \param extSem - External semaphore to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem);
+
+/** @} */ /* END CUDA_EXTRES_INTEROP */
+
+/**
+ * \defgroup CUDA_MEMOP Stream Memory Operations
+ *
+ * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream memory operations of the low-level CUDA
+ * driver application programming interface.
+ *
+ * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+ *
+ * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64()
+ * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and
+ * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and
+ * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform
+ * hardware features and can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.
+ *
+ * Note that all memory pointers passed as parameters to these operations
+ * are device pointers. Where necessary a device pointer should be
+ * obtained, for example with ::cuMemHostGetDevicePointer().
+ *
+ * None of the operations accepts pointers to managed memory buffers
+ * (::cuMemAllocManaged).
+ *
+ * \note
+ * Warning:
+ * Improper use of these APIs may deadlock the application. Synchronization 
+ * ordering established through these APIs is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by these APIs should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order.
+ *
+ * @{
+ */
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order.
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order.
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue64,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue32,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Batch operations to synchronize the stream via memory operations
+ *
+ * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32().
+ * Batching operations may avoid some performance overhead in both the API call
+ * and the device execution versus adding them to the stream in separate API
+ * calls. The operations are enqueued in the order they appear in the array.
+ *
+ * See ::CUstreamBatchMemOpType for the full set of supported operations, and
+ * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(),
+ * and ::cuStreamWriteValue64() for details of specific operations.
+ *
+ * See related APIs for details on querying support for specific operations.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param stream The stream to enqueue the operations in.
+ * \param count The number of operations in the array. Must be less than 256.
+ * \param paramArray The types and parameters of the individual operations.
+ * \param flags Reserved for future expansion; must be 0.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuMemHostRegister
+ */
+CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+/** @} */ /* END CUDA_MEMOP */
+
+/**
+ * \defgroup CUDA_EXEC Execution Control
+ *
+ * ___MANBRIEF___ execution control functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the execution control functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns information about a function
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on the kernel
+ * given by \p hfunc. The supported attributes are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
+ *   per block, beyond which a launch of the function would fail. This number
+ *   depends on both the function and the device on which the function is
+ *   currently loaded.
+ * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
+ *   statically-allocated shared memory per block required by this function.
+ *   This does not include dynamically-allocated shared memory requested by
+ *   the user at runtime.
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+ *   constant memory required by this function.
+ * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+ *   used by each thread of this function.
+ * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+ *   of this function.
+ * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+ *   which the function was compiled. This value is the major PTX version * 10
+ *   + the minor PTX version, so a PTX version 1.3 function would return the
+ *   value 13. Note that this may return the undefined value of 0 for cubins
+ *   compiled prior to CUDA 3.0.
+ * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+ *   which the function was compiled. This value is the major binary
+ *   version * 10 + the minor binary version, so a binary version 1.3 function
+ *   would return the value 13. Note that this will return a value of 10 for
+ *   legacy cubins that do not have a properly-encoded binary architecture
+ *   version.
+ * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the function has
+ *   been compiled with user specified option "-Xptxas --dlcm=ca" set.
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+ *   cache split ratio in percent of total shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the
+ *   kernel must launch with a valid cluster size specified.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed. A non-portable cluster size may only function on the
+ *   specific SKUs the program is tested on. The launch might fail if the
+ *   program is run on a different hardware platform. CUDA API provides
+ *   cudaOccupancyMaxActiveClusters to assist with checking whether the desired
+ *   size can be launched on the current device. A portable cluster size is
+ *   guaranteed to be functional on all compute capabilities higher than the
+ *   target compute capability. The portable cluster size for sm_90 is 8 blocks
+ *   per cluster. This value may increase for future compute capabilities. The
+ *   specific hardware unit may support higher cluster sizes that are not
+ *   guaranteed to be portable.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * With a few exceptions, function attributes may also be queried on unloaded
+ * function handles returned from ::cuModuleEnumerateFunctions.
+ * ::CUDA_ERROR_FUNCTION_NOT_LOADED is returned if the attribute requires a fully
+ * loaded function but the function is not loaded. The loading state of a function
+ * may be queried using ::cuFuncIsLoaded. ::cuFuncLoad may be called to explicitly
+ * load a function before querying the following attributes that require the function
+ * to be loaded:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+ *
+ * \param pi     - Returned attribute value
+ * \param attrib - Attribute requested
+ * \param hfunc  - Function to query attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_FUNCTION_NOT_LOADED
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute,
+ * ::cuFuncIsLoaded,
+ * ::cuFuncLoad,
+ * ::cuKernelGetAttribute
+ */
+CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+
+/**
+ * \brief Sets information about a function
+ *
+ * This call sets the value of a specified attribute \p attrib on the kernel given
+ * by \p hfunc to an integer value specified by \p value.
+ * This function returns CUDA_SUCCESS if the new value of the attribute could be
+ * successfully set. If the set fails, this call will return an error.
+ * Not all attributes can have values set. Attempting to set a value on a read-only
+ * attribute will result in an error (CUDA_ERROR_INVALID_VALUE)
+ *
+ * Supported attributes for the cuFuncSetAttribute call are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory. The value should contain the requested
+ *   maximum size of dynamically-allocated shared memory. The sum of this value and
+ *   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+ *   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+ *   The maximal size of requestable dynamic shared memory may differ by GPU
+ *   architecture.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+ *   cache and shared memory use the same hardware resources, this sets the shared memory
+ *   carveout preference, in percent of the total shared memory. 
+ *   See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
+ * \param hfunc  - Function to set attribute of
+ * \param attrib - Attribute to set
+ * \param value  - The value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the device function \p hfunc. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute \p hfunc.  Any context-wide preference
+ * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
+ * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
+ * that case, the current context-wide setting will be used.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param hfunc  - Kernel to configure cache for
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuKernelSetCacheConfig
+ */
+CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+
+
+/**
+ * \brief Returns a module handle
+ *
+ * Returns in \p *hmod the handle of the module that function \p hfunc
+ * is located in. The lifetime of the module corresponds to the lifetime of
+ * the context it was loaded in or until the module is explicitly unloaded.
+ *
+ * The CUDA runtime manages its own modules loaded into the primary context.
+ * If the handle returned by this API refers to a module loaded by the CUDA runtime,
+ * calling ::cuModuleUnload() on that module will result in undefined behavior.
+ *
+ * \param hmod - Returned module handle
+ * \param hfunc   - Function to retrieve module for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc);
+
+/**
+ * \brief Returns the function name for a ::CUfunction handle
+ *
+ * Returns in \p **name the function name associated with the function handle \p hfunc .
+ * The function name is returned as a null-terminated string. The returned name is only 
+ * valid when the function handle is valid. If the module is unloaded or reloaded, one 
+ * must call the API again to get the updated name. This API may return a mangled name if
+ * the function is not declared as having C linkage. If either \p **name or \p hfunc 
+ * is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param name - The returned name of the function
+ * \param hfunc - The function handle to retrieve the name for 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFuncGetName(const char **name, CUfunction hfunc);
+
+/**
+ * \brief Returns the offset and size of a kernel parameter in the device-side parameter layout
+ *
+ * Queries the kernel parameter at \p paramIndex into \p func's list of parameters, and returns
+ * in \p paramOffset and \p paramSize the offset and size, respectively, where the parameter
+ * will reside in the device-side parameter layout. This information can be used to update kernel
+ * node parameters from the device via ::cudaGraphKernelNodeSetParam() and
+ * ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters
+ * that \p func takes. \p paramSize can be set to NULL if only the parameter offset is desired.
+ *
+ * \param func        - The function to query
+ * \param paramIndex  - The parameter index to query
+ * \param paramOffset - Returns the offset into the device-side parameter layout at which the parameter resides
+ * \param paramSize   - Optionally returns the size of the parameter in the device-side parameter layout
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa ::cuKernelGetParamInfo
+ */
+CUresult CUDAAPI cuFuncGetParamInfo(CUfunction func, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+
+typedef enum CUfunctionLoadingState_enum { /* Loading state of a function, as reported through ::cuFuncIsLoaded */
+    CU_FUNCTION_LOADING_STATE_UNLOADED = 0, /**< Function is not loaded */
+    CU_FUNCTION_LOADING_STATE_LOADED = 1,   /**< Function is loaded */
+    CU_FUNCTION_LOADING_STATE_MAX           /**< NOTE(review): presumably an upper-bound sentinel rather than a real state -- confirm */
+} CUfunctionLoadingState;
+
+/**
+ * \brief Returns if the function is loaded
+ *
+ * Returns in \p state the loading state of \p function.
+ *
+ * \param state - returned loading state
+ * \param function - the function to check
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuFuncLoad,
+ * ::cuModuleEnumerateFunctions
+ */
+CUresult CUDAAPI cuFuncIsLoaded(CUfunctionLoadingState *state, CUfunction function);
+
+/**
+ * \brief Loads a function
+ *
+ * Finalizes function loading for \p function. Calling this API with a
+ * fully loaded function has no effect.
+ *
+ * \param function - the function to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleEnumerateFunctions,
+ * ::cuFuncIsLoaded
+ */
+CUresult CUDAAPI cuFuncLoad(CUfunction function);
+
+/**
+ * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel
+ *
+ * Invokes the function ::CUfunction or the kernel ::CUkernel \p f
+ * on a \p gridDimX x \p gridDimY x \p gridDimZ grid of blocks.
+ * Each block contains \p blockDimX x \p blockDimY x
+ * \p blockDimZ threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p f can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams.  If \p f
+ * has N parameters, then \p kernelParams needs to be an array of N
+ * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ * must point to a region of memory from which the actual kernel
+ * parameter will be copied.  The number of kernel parameters and their
+ * offsets and sizes do not need to be specified as that information is
+ * retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters can also be packaged by the application into
+ * a single buffer that is passed in via the \p extra parameter.
+ * This places the burden on the application of knowing each kernel
+ * parameter's size and alignment/padding within the buffer.  Here is
+ * an example of using the \p extra parameter in this manner:
+ * \code
+    size_t argBufferSize;
+    char argBuffer[256];
+
+    // populate argBuffer and argBufferSize
+
+    void *config[] = {
+        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
+        CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
+        CU_LAUNCH_PARAM_END
+    };
+    status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
+ * \endcode
+ *
+ * The \p extra parameter exists to allow ::cuLaunchKernel to take
+ * additional less commonly used arguments.  \p extra specifies a list of
+ * names of extra settings and their corresponding values.  Each extra
+ * setting name is immediately followed by the corresponding value.  The
+ * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer containing all
+ *   the kernel parameters for launching kernel \p f;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t containing the
+ *   size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel
+ * parameters are specified with both \p kernelParams and \p extra
+ * (i.e. both \p kernelParams and \p extra are non-NULL).
+ *
+ * Calling ::cuLaunchKernel() invalidates the persistent function state
+ * set through the following deprecated APIs:
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(),
+ *  ::cuParamSetv().
+ *
+ * Note that to use ::cuLaunchKernel(), the kernel \p f must either have
+ * been compiled with toolchain version 3.2 or later so that it will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchKernel() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
+ * Note that the API can also be used to launch context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to launch
+ * the kernel on will either be taken from the specified stream \p hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ * \param gridDimX       - Width of grid in blocks
+ * \param gridDimY       - Height of grid in blocks
+ * \param gridDimZ       - Depth of grid in blocks
+ * \param blockDimX      - X dimension of each thread block
+ * \param blockDimY      - Y dimension of each thread block
+ * \param blockDimZ      - Z dimension of each thread block
+ * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ * \param hStream        - Stream identifier
+ * \param kernelParams   - Array of pointers to kernel parameters
+ * \param extra          - Extra options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cudaLaunchKernel,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelSetCacheConfig,
+ * ::cuKernelGetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuLaunchKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams,
+                                void **extra);
+
+/**
+ * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel with launch-time configuration
+ *
+ * Invokes the function ::CUfunction or the kernel ::CUkernel \p f with the specified launch-time configuration
+ * \p config.
+ *
+ * The ::CUlaunchConfig structure is defined as:
+ *
+ * \code
+ *       typedef struct CUlaunchConfig_st {
+ *     unsigned int gridDimX;
+ *     unsigned int gridDimY;
+ *     unsigned int gridDimZ;
+ *     unsigned int blockDimX;
+ *     unsigned int blockDimY;
+ *     unsigned int blockDimZ;
+ *     unsigned int sharedMemBytes;
+ *     CUstream hStream;
+ *     CUlaunchAttribute *attrs;
+ *     unsigned int numAttrs;
+ * } CUlaunchConfig;
+ * \endcode
+ *
+ * where:
+ * - ::CUlaunchConfig::gridDimX is the width of the grid in blocks.
+ * - ::CUlaunchConfig::gridDimY is the height of the grid in blocks.
+ * - ::CUlaunchConfig::gridDimZ is the depth of the grid in blocks.
+ * - ::CUlaunchConfig::blockDimX is the X dimension of each thread block.
+ * - ::CUlaunchConfig::blockDimY is the Y dimension of each thread block.
+ * - ::CUlaunchConfig::blockDimZ is the Z dimension of each thread block.
+ * - ::CUlaunchConfig::sharedMemBytes is the dynamic shared-memory size per
+ *   thread block in bytes.
+ * - ::CUlaunchConfig::hStream is the handle to the stream to perform the launch
+ *   in. The CUDA context associated with this stream must match that associated
+ *   with function f.
+ * - ::CUlaunchConfig::attrs is an array of ::CUlaunchConfig::numAttrs
+ *   contiguous ::CUlaunchAttribute elements. The value of this pointer is not
+ *   considered if ::CUlaunchConfig::numAttrs is zero. However, in that case, it
+ *   is recommended to set the pointer to NULL.
+ * - ::CUlaunchConfig::numAttrs is the number of attributes populating the
+ *   first ::CUlaunchConfig::numAttrs positions of the ::CUlaunchConfig::attrs
+ *   array.
+ *
+ * Launch-time configuration is specified by adding entries to
+ * ::CUlaunchConfig::attrs. Each entry is an attribute ID and a corresponding
+ * attribute value.
+ *
+ * The ::CUlaunchAttribute structure is defined as:
+ * \code
+ *       typedef struct CUlaunchAttribute_st {
+ *     CUlaunchAttributeID id;
+ *     CUlaunchAttributeValue value;
+ * } CUlaunchAttribute;
+ * \endcode
+ * where:
+ * - ::CUlaunchAttribute::id is a unique enum identifying the attribute.
+ * - ::CUlaunchAttribute::value is a union that holds the attribute value.
+ *
+ * An example of using the \p config parameter:
+ * \code
+ *       CUlaunchAttribute coopAttr = {.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE,
+ *                               .value = 1};
+ * CUlaunchConfig config = {... // set block and grid dimensions
+ *                        .attrs = &coopAttr,
+ *                        .numAttrs = 1};
+ *
+ * cuLaunchKernelEx(&config, kernel, NULL, NULL);
+ * \endcode
+ *
+ * The ::CUlaunchAttributeID enum is defined as:
+ * \code
+ *       typedef enum CUlaunchAttributeID_enum {
+ *     CU_LAUNCH_ATTRIBUTE_IGNORE = 0,
+ *     CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 1,
+ *     CU_LAUNCH_ATTRIBUTE_COOPERATIVE            = 2,
+ *     CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3,
+ *     CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION                    = 4,
+ *     CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5,
+ *     CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION    = 6,
+ *     CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT                   = 7,
+ *     CU_LAUNCH_ATTRIBUTE_PRIORITY               = 8,
+ *     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP    = 9,
+ *     CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN        = 10,
+ *     CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION = 11,
+ *     CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT = 12,
+ *     CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE = 13,
+ * } CUlaunchAttributeID;
+ * \endcode
+ *
+ * and the corresponding ::CUlaunchAttributeValue union as :
+ * \code
+ *       typedef union CUlaunchAttributeValue_union {
+ *     CUaccessPolicyWindow accessPolicyWindow;
+ *     int cooperative;
+ *     CUsynchronizationPolicy syncPolicy;
+ *     struct {
+ *         unsigned int x;
+ *         unsigned int y;
+ *         unsigned int z;
+ *     } clusterDim;
+ *     CUclusterSchedulingPolicy clusterSchedulingPolicyPreference;
+ *     int programmaticStreamSerializationAllowed;
+ *     struct {
+ *         CUevent event;
+ *         int flags;
+ *         int triggerAtBlockStart;
+ *     } programmaticEvent;
+ *     int priority;
+ *     CUlaunchMemSyncDomainMap memSyncDomainMap;
+ *     CUlaunchMemSyncDomain memSyncDomain;
+ *     struct {
+ *         unsigned int x;
+ *         unsigned int y;
+ *         unsigned int z;
+ *     } preferredClusterDim;
+ *     struct {
+ *         CUevent event;
+ *         int flags;
+ *     } launchCompletionEvent;
+ *     struct {
+ *         int deviceUpdatable;
+ *         CUgraphDeviceNode devNode;
+ *     } deviceUpdatableKernelNode;
+ * } CUlaunchAttributeValue;
+ * \endcode
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE to a non-zero value causes the
+ * kernel launch to be a cooperative launch, with exactly the same usage and
+ * semantics of ::cuLaunchCooperativeKernel.
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION to a non-zero
+ * value causes the kernel to use programmatic means to resolve its stream
+ * dependency -- enabling the CUDA runtime to opportunistically allow the grid's
+ * execution to overlap with the previous kernel in the stream, if that kernel
+ * requests the overlap.
+ *
+ * ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT records an event along with the
+ * kernel launch. Event recorded through this launch attribute is guaranteed to
+ * only trigger after all blocks in the associated kernel trigger the event. A
+ * block can trigger the event through PTX launchdep.release or CUDA builtin
+ * function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be
+ * inserted at the beginning of each block's execution if triggerAtBlockStart is
+ * set to non-0. Note that dependents (including the CPU thread calling
+ * cuEventSynchronize()) are not guaranteed to observe the release precisely
+ * when it is released. For example, cuEventSynchronize() may only observe the
+ * event trigger long after the associated kernel has completed. This recording
+ * type is primarily meant for establishing programmatic dependency between
+ * device tasks. The event supplied must not be an interprocess or interop
+ * event. The event must disable timing (i.e. created with
+ * ::CU_EVENT_DISABLE_TIMING flag set).
+ *
+ * ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT records an event along with
+ * the kernel launch. Nominally, the event is triggered once all blocks of the
+ * kernel have begun execution. Currently this is a best effort. If a kernel B
+ * has a launch completion dependency on a kernel A, B may wait until A is
+ * complete. Alternatively, blocks of B may begin before all blocks of A have
+ * begun, for example:
+ *
+ *  - If B can claim execution resources unavailable to A, for example if they
+ *    run on different GPUs.
+ *  - If B is a higher priority than A.
+ *
+ * Exercise caution if such an ordering inversion could lead to deadlock. The
+ * event supplied must not be an interprocess or interop event. The event must
+ * disable timing (i.e. must be created with the ::CU_EVENT_DISABLE_TIMING flag
+ * set).
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE to 1
+ * on a captured launch causes the resulting kernel node to be device-updatable.
+ * This attribute is specific to graphs, and passing it to a launch in a
+ * non-capturing stream results in an error. Passing a value other than 0 or 1 is
+ * not allowed.
+ *
+ * On success, a handle will be returned via
+ * ::CUlaunchAttributeValue::deviceUpdatableKernelNode::devNode which can be passed
+ * to the various device-side update functions to update the node's kernel parameters
+ * from within another kernel. For more information on the types of device updates
+ * that can be made, as well as the relevant limitations thereof, see
+ * ::cudaGraphKernelNodeUpdatesApply.
+ *
+ * Kernel nodes which are device-updatable have additional restrictions compared to regular
+ * kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via
+ * ::cuGraphDestroyNode. Additionally, once opted-in to this functionality, a node cannot
+ * opt out, and any attempt to set the attribute to 0 will result in an error. Graphs
+ * containing one or more device-updatable nodes also do not allow multiple instantiation.
+ *
+ * ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION allows the kernel launch to
+ * specify a preferred substitute cluster dimension. Blocks may be grouped
+ * according to either the dimensions specified with this attribute (grouped
+ * into a "preferred substitute cluster"), or the one specified with
+ * ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped into a "regular
+ * cluster"). The cluster dimensions of a "preferred substitute cluster" shall
+ * be an integer multiple greater than zero of the regular cluster dimensions.
+ * The device will attempt - on a best-effort basis - to group thread blocks
+ * into preferred clusters over grouping them into regular clusters. When it
+ * deems necessary (primarily when the device temporarily runs out of physical
+ * resources to launch the larger preferred clusters), the device may switch to
+ * launch the regular clusters instead to attempt to utilize as much of the
+ * physical device resources as possible.
+ *
+ * Each type of cluster will have its enumeration / coordinate setup as if the
+ * grid consists solely of its type of cluster. For example, if the preferred
+ * substitute cluster dimensions double the regular cluster dimensions, there
+ * might be simultaneously a regular cluster indexed at (1,0,0), and a preferred
+ * cluster indexed at (1,0,0). In this example, the preferred substitute cluster
+ * (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their
+ * blocks.
+ *
+ * This attribute will only take effect when a regular cluster dimension has
+ * been specified. The preferred substitute cluster
+ * dimension must be an integer multiple greater than zero of the regular
+ * cluster dimension and must divide the grid. It must also be no more than
+ * `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`.
+ * Otherwise it must be less than the maximum value the driver can support.
+ * Otherwise, setting this attribute to a value physically unable to fit on any
+ * particular device is permitted.
+ *
+ * The effect of other attributes is consistent with their effect when set via
+ * persistent APIs.
+ *
+ * See ::cuStreamSetAttribute for
+ * - ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+ * - ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY
+ *
+ * See ::cuFuncSetAttribute for
+ * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+ * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
+ *
+ * Kernel parameters to \p f can be specified in the same ways that they can be
+ * using ::cuLaunchKernel.
+ *
+ * Note that the API can also be used to launch context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to launch
+ * the kernel on will either be taken from the specified stream ::CUlaunchConfig::hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param config         - Config to launch
+ * \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ * \param kernelParams   - Array of pointers to kernel parameters
+ * \param extra          - Extra options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cudaLaunchKernel,
+ * ::cudaLaunchKernelEx,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelSetCacheConfig,
+ * ::cuKernelGetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config,
+                                  CUfunction f,
+                                  void **kernelParams,
+                                  void **extra);
+
+/**
+ * \brief Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel where thread blocks
+ * can cooperate and synchronize as they execute
+ *
+ * Invokes the function ::CUfunction or the kernel ::CUkernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
+ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
+ * \p blockDimZ threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * Kernel parameters must be specified via \p kernelParams.  If \p f
+ * has N parameters, then \p kernelParams needs to be an array of N
+ * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ * must point to a region of memory from which the actual kernel
+ * parameter will be copied.  The number of kernel parameters and their
+ * offsets and sizes do not need to be specified as that information is
+ * retrieved directly from the kernel's image.
+ *
+ * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is
+ * the same as function state set through ::cuLaunchKernel API
+ *
+ * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous
+ * block shape, shared size and parameter info associated with \p f
+ * is overwritten.
+ *
+ * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have
+ * been compiled with toolchain version 3.2 or later so that it will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
+ * Note that the API can also be used to launch context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to launch
+ * the kernel on will either be taken from the specified stream \p hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param f              - Function ::CUfunction or Kernel ::CUkernel to launch
+ * \param gridDimX       - Width of grid in blocks
+ * \param gridDimY       - Height of grid in blocks
+ * \param gridDimZ       - Depth of grid in blocks
+ * \param blockDimX      - X dimension of each thread block
+ * \param blockDimY      - Y dimension of each thread block
+ * \param blockDimZ      - Z dimension of each thread block
+ * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ * \param hStream        - Stream identifier
+ * \param kernelParams   - Array of pointers to kernel parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchCooperativeKernelMultiDevice,
+ * ::cudaLaunchCooperativeKernel,
+ * ::cuLibraryGetKernel,
+ * ::cuKernelSetCacheConfig,
+ * ::cuKernelGetAttribute,
+ * ::cuKernelSetAttribute
+ */
+CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams);
+
+/**
+ * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+ *
+ * \deprecated This function is deprecated as of CUDA 11.3.
+ *
+ * Invokes kernels as specified in the \p launchParamsList array where each element
+ * of the array specifies all the parameters required to perform a single kernel launch.
+ * These kernels can cooperate and synchronize as they execute. The size of the array is
+ * specified by \p numDevices.
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH.
+ *
+ * All kernels launched must be identical with respect to the compiled code. Note that
+ * any __device__, __constant__ or __managed__ variables present in the module that owns
+ * the kernel launched on each device, are independently instantiated on every device.
+ * It is the application's responsibility to ensure these variables are initialized and
+ * used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves
+ * and the amount of shared memory used by each thread block must also match across
+ * all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cuStreamCreate
+ * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD
+ * cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernels cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::CUDA_LAUNCH_PARAMS structure is defined as:
+ * \code
+        typedef struct CUDA_LAUNCH_PARAMS_st
+        {
+            CUfunction function;
+            unsigned int gridDimX;
+            unsigned int gridDimY;
+            unsigned int gridDimZ;
+            unsigned int blockDimX;
+            unsigned int blockDimY;
+            unsigned int blockDimZ;
+            unsigned int sharedMemBytes;
+            CUstream hStream;
+            void **kernelParams;
+        } CUDA_LAUNCH_PARAMS;
+ * \endcode
+ * where:
+ * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must
+ *   be identical with respect to the compiled code.
+ *   Note that you can also specify context-less kernel ::CUkernel by querying the handle
+ *   using ::cuLibraryGetKernel() and then casting to ::CUfunction. In this case, the context to
+ *   launch the kernel on will be taken from the specified stream ::CUDA_LAUNCH_PARAMS::hStream.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes.
+ *   This must match across all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot
+ *   be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated
+ *   with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function.
+ * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If
+ *   ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams
+ *   needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through
+ *   ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual
+ *   kernel parameter will be copied. The number of kernel parameters and their offsets and sizes
+ *   do not need to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is
+ * the same as function state set through ::cuLaunchKernel API when called individually for each
+ * element in \p launchParamsList.
+ *
+ * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous
+ * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function
+ * in \p launchParamsList is overwritten.
+ *
+ * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have
+ * been compiled with toolchain version 3.2 or later so that they will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If neither of these conditions is met, then ::cuLaunchCooperativeKernelMultiDevice() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
+ * \param launchParamsList - List of launch parameters, one per device
+ * \param numDevices       - Size of the \p launchParamsList array
+ * \param flags            - Flags to control launch behavior
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchCooperativeKernel,
+ * ::cudaLaunchCooperativeKernelMultiDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags);
+
+/**
+ * \brief Enqueues a host function call in a stream
+ *
+ * Enqueues a host function to run in a stream.  The function will be called
+ * after currently enqueued work and will block work added after it.
+ *
+ * The host function must not make any CUDA API calls.  Attempting to use a
+ * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required.
+ * The host function must not perform any synchronization that may depend on
+ * outstanding CUDA work not mandated to run earlier.  Host functions without a
+ * mandated order (such as in independent streams) execute in undefined order
+ * and may be serialized.
+ *
+ * For the purposes of Unified Memory, execution makes a number of guarantees:
+ * <ul>
+ *   <li>The stream is considered idle for the duration of the function's
+ *   execution.  Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.</li>
+ *   <li>The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function.  It thus synchronizes streams which have been "joined"
+ *   prior to the function.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a function might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   function call with an event.</li>
+ *   <li>Completion of the function does not cause a stream to become
+ *   active except as described above.  The stream will remain idle
+ *   if no device work follows the function, and will remain idle across
+ *   consecutive host functions or stream callbacks without device work in
+ *   between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a host function at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * Note that, in contrast to ::cuStreamAddCallback, the function will not be
+ * called in the event of an error in the CUDA context.
+ *
+ * \param hStream  - Stream to enqueue function call in
+ * \param fn       - The function to call once preceding stream operations are complete
+ * \param userData - User-specified data to be passed to the function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cuStreamAttachMemAsync,
+ * ::cuStreamAddCallback
+ */
+CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
+
+/** @} */ /* END CUDA_EXEC */
+
+/**
+ * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated execution control functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the block-dimensions for the function
+ *
+ * \deprecated
+ *
+ * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are
+ * created when the kernel given by \p hfunc is launched.
+ *
+ * \param hfunc - Kernel to specify dimensions of
+ * \param x     - X dimension
+ * \param y     - Y dimension
+ * \param z     - Z dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetSharedSize,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
+
+/**
+ * \brief Sets the dynamic shared-memory size for the function
+ *
+ * \deprecated
+ *
+ * Sets through \p bytes the amount of dynamic shared memory that will be
+ * available to each thread block when the kernel given by \p hfunc is launched.
+ *
+ * \param hfunc - Kernel to specify dynamic shared-memory size for
+ * \param bytes - Dynamic shared-memory size per thread block in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+
+/**
+ * \brief Sets the parameter size for the function
+ *
+ * \deprecated
+ *
+ * Sets through \p numbytes the total size in bytes needed by the function
+ * parameters of the kernel corresponding to \p hfunc.
+ *
+ * \param hfunc    - Kernel to set parameter size for
+ * \param numbytes - Size of parameter list in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+
+/**
+ * \brief Adds an integer parameter to the function's argument list
+ *
+ * \deprecated
+ *
+ * Sets an integer parameter that will be specified the next time the
+ * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
+ *
+ * \param hfunc  - Kernel to add parameter to
+ * \param offset - Offset to add parameter to argument list
+ * \param value  - Value of parameter
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value);
+
+/**
+ * \brief Adds a floating-point parameter to the function's argument list
+ *
+ * \deprecated
+ *
+ * Sets a floating-point parameter that will be specified the next time the
+ * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
+ *
+ * \param hfunc  - Kernel to add parameter to
+ * \param offset - Offset to add parameter to argument list
+ * \param value  - Value of parameter
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value);
+
+/**
+ * \brief Adds arbitrary data to the function's argument list
+ *
+ * \deprecated
+ *
+ * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr
+ * into the parameter space of the kernel corresponding to \p hfunc. \p offset
+ * is a byte offset.
+ *
+ * \param hfunc    - Kernel to add data to
+ * \param offset   - Offset to add data to argument list
+ * \param ptr      - Pointer to arbitrary data
+ * \param numbytes - Size of data to copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block
+ * contains the number of threads specified by a previous call to
+ * ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f - Kernel to launch
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ * blocks. Each block contains the number of threads specified by a previous
+ * call to ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f           - Kernel to launch
+ * \param grid_width  - Width of grid in blocks
+ * \param grid_height - Height of grid in blocks
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ * blocks. Each block contains the number of threads specified by a previous
+ * call to ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f           - Kernel to launch
+ * \param grid_width  - Width of grid in blocks
+ * \param grid_height - Height of grid in blocks
+ * \param hStream     - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ *
+ * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no),
+ *       this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by
+ *       growing the per-thread stack as needed per launch and not shrinking it afterwards.
+ *
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+
+
+/**
+ * \brief Adds a texture-reference to the function's argument list
+ *
+ * \deprecated
+ *
+ * Makes the CUDA array or linear memory bound to the texture reference
+ * \p hTexRef available to a device program as a texture. In this version of
+ * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and
+ * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
+ *
+ * \param hfunc   - Kernel to add texture-reference to
+ * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
+ * \param hTexRef - Texture-reference to add to argument list
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+
+/**
+ * \brief Sets the shared memory configuration for a device function.
+ *
+ * \deprecated
+ *
+ * On devices with configurable shared memory banks, this function will
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions
+ * may introduce a device side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via
+ * ::cuFuncSetSharedMemConfig will override the context wide setting set with
+ * ::cuCtxSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
+ *   configuration when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes when launching this function.
+ *
+ * \param hfunc  - kernel to be given a shared memory config
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxSetSharedMemConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel,
+ * ::cudaFuncSetSharedMemConfig
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
+
+/** @} */ /* END CUDA_EXEC_DEPRECATED */
+
+/**
+ * \defgroup CUDA_GRAPH Graph Management
+ *
+ * ___MANBRIEF___ graph management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graph management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a graph
+ *
+ * Creates an empty graph, which is returned via \p phGraph.
+ *
+ * \param phGraph - Returns newly created graph
+ * \param flags   - Graph creation flags, must be 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphInstantiate,
+ * ::cuGraphDestroy,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
+
+/**
+ * \brief Creates a kernel execution node and adds it to a graph
+ *
+ * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The CUDA_KERNEL_NODE_PARAMS structure is defined as:
+ *
+ * \code
+ *  typedef struct CUDA_KERNEL_NODE_PARAMS_st {
+ *      CUfunction func;
+ *      unsigned int gridDimX;
+ *      unsigned int gridDimY;
+ *      unsigned int gridDimZ;
+ *      unsigned int blockDimX;
+ *      unsigned int blockDimY;
+ *      unsigned int blockDimZ;
+ *      unsigned int sharedMemBytes;
+ *      void **kernelParams;
+ *      void **extra;
+ *      CUkernel kern;
+ *      CUcontext ctx;
+ *  } CUDA_KERNEL_NODE_PARAMS;
+ * \endcode
+ *
+ * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x
+ * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains
+ * (\p blockDimX x \p blockDimY x \p blockDimZ) threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p func can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
+ * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
+ * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
+ * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
+ * to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single
+ * buffer that is passed in via \p extra. This places the burden on the application of knowing each
+ * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ * to allow this function to take additional less commonly used arguments. \p extra specifies
+ * a list of names of extra settings and their corresponding values. Each extra setting name is
+ * immediately followed by the corresponding value. The list must be terminated with either NULL or
+ * ::CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer
+ *   containing all the kernel parameters for launching kernel
+ *   \p func;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t
+ *   containing the size of the buffer specified with
+ *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both
+ * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL).
+ * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel.
+ *
+ * The \p kernelParams or \p extra array, as well as the argument values it points to,
+ * are copied during this call.
+ *
+ * \note Kernels launched using graphs must not use texture and surface references. Reading or
+ *       writing through any texture or surface reference is undefined behavior.
+ *       This restriction does not apply to texture and surface objects.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the GPU execution node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuLaunchKernel,
+ * ::cuLaunchCooperativeKernel,
+ * ::cuGraphKernelNodeGetParams,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a kernel node's parameters
+ *
+ * Returns the parameters of kernel node \p hNode in \p nodeParams.
+ * The \p kernelParams or \p extra array returned in \p nodeParams,
+ * as well as the argument values it points to, are owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphKernelNodeSetParams to update the
+ * parameters of this node.
+ *
+ * The params will contain either \p kernelParams or \p extra,
+ * according to which of these was most recently set on the node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeSetParams
+ */
+CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a kernel node's parameters
+ *
+ * Sets the parameters of kernel node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuLaunchKernel,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeGetParams
+ */
+CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a memcpy node and adds it to a graph
+ *
+ * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the graph is launched, the node will perform the memcpy described by \p copyParams.
+ * See ::cuMemcpy3D() for a description of the structure and its restrictions.
+ *
+ * Memcpy nodes have some additional restrictions with regard to managed memory if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer
+ * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed
+ * for those operand(s). The managed memory will be treated as residing on either the
+ * host or the device, depending on which memory type is specified.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param copyParams      - Parameters for the memory copy
+ * \param ctx             - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuMemcpy3D,
+ * ::cuGraphMemcpyNodeGetParams,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
+
+/**
+ * \brief Returns a memcpy node's parameters
+ *
+ * Returns the parameters of memcpy node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemcpy3D,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeSetParams
+ */
+CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters
+ *
+ * Sets the parameters of memcpy node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuMemcpy3D,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams);
+
+/**
+ * \brief Creates a memset node and adds it to a graph
+ *
+ * Creates a new memset node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The element size must be 1, 2, or 4 bytes.
+ * When the graph is launched, the node will perform the memset described by \p memsetParams.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param memsetParams    - Parameters for the memory set
+ * \param ctx             - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuMemsetD2D32,
+ * ::cuGraphMemsetNodeGetParams,
+ * ::cuGraphMemsetNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode
+ */
+CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
+
+/**
+ * \brief Returns a memset node's parameters
+ *
+ * Returns the parameters of memset node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemsetD2D32,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeSetParams
+ */
+CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a memset node's parameters
+ *
+ * Sets the parameters of memset node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuMemsetD2D32,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a host execution node and adds it to a graph
+ *
+ * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the graph is launched, the node will invoke the specified CPU function.
+ * Host nodes are not supported under MPS with pre-Volta GPUs.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the host node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuLaunchHostFunc,
+ * ::cuGraphHostNodeGetParams,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a host node's parameters
+ *
+ * Returns the parameters of host node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchHostFunc,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeSetParams
+ */
+CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a host node's parameters
+ *
+ * Sets the parameters of host node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuLaunchHostFunc,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeGetParams
+ */
+CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a child graph node and adds it to a graph
+ *
+ * Creates a new node which executes an embedded graph, and adds it to \p hGraph with
+ * \p numDependencies dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * If \p childGraph contains allocation or free nodes, this call will return an error.
+ *
+ * The node executes an embedded child graph. The child graph is cloned in this call.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param childGraph      - The graph to clone into this node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
+
+/**
+ * \brief Gets a handle to the embedded graph of a child graph node
+ *
+ * Gets a handle to the embedded graph in a child graph node. This call
+ * does not clone the graph. Changes to the graph will be reflected in
+ * the node, and the node retains ownership of the graph.
+ *
+ * Allocation and free nodes cannot be added to the returned graph.
+ * Attempting to do so will return an error.
+ *
+ * \param hNode   - Node to get the embedded graph for
+ * \param phGraph - Location to store a handle to the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphNodeFindInClone
+ */
+CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph);
+
+/**
+ * \brief Creates an empty node and adds it to a graph
+ *
+ * Creates a new node which performs no operation, and adds it to \p hGraph with
+ * \p numDependencies dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * An empty node performs no operation during execution, but can be used for
+ * transitive ordering. For example, a phased execution graph with 2 groups of n
+ * nodes with a barrier between them can be represented using an empty node and
+ * 2*n dependency edges, rather than no empty node and n^2 dependency edges.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
+
+/**
+ * \brief Creates an event record node and adds it to a graph
+ *
+ * Creates a new event record node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Each launch of the graph will record \p event to capture execution of the
+ * node's dependencies.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+
+/**
+ * \brief Returns the event associated with an event record node
+ *
+ * Returns the event of event record node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event record node's event
+ *
+ * Sets the event of event record node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an event wait node and adds it to a graph
+ *
+ * Creates a new event wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The graph node will wait for all work captured in \p event.  See ::cuEventRecord()
+ * for details on what is captured by an event. \p event may be from a different context
+ * or device than the launch stream.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+
+/**
+ * \brief Returns the event associated with an event wait node
+ *
+ * Returns the event of event wait node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event wait node's event
+ *
+ * Sets the event of event wait node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an external semaphore signal node and adds it to a graph
+ *
+ * Creates a new external semaphore signal node and adds it to \p hGraph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified
+ * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ * node will be placed at the root of the graph. \p dependencies may not have any
+ * duplicate entries. A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a signal operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The operation(s) will occur after all of the node's
+ * dependencies have completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore signal node's parameters
+ *
+ * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore signal node's parameters
+ *
+ * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an external semaphore wait node and adds it to a graph
+ *
+ * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The node's dependents will not be launched until
+ * the wait operation has completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a batch memory operation node and adds it to a graph
+ *
+ * Creates a new batch memory operation node and adds it to \p hGraph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the node is added, the paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuStreamBatchMemOp,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue64,
+ * ::cuGraphBatchMemOpNodeGetParams,
+ * ::cuGraphBatchMemOpNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddBatchMemOpNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a batch mem op node's parameters
+ *
+ * Returns the parameters of batch mem op node \p hNode in \p nodeParams_out.
+ * The \p paramArray returned in \p nodeParams_out is owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphBatchMemOpNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode          - Node to get the parameters for
+ * \param nodeParams_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBatchMemOp,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeSetParams
+ */
+CUresult CUDAAPI cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
+
+/**
+ * \brief Sets a batch mem op node's parameters
+ *
+ * Sets the parameters of batch mem op node \p hNode to \p nodeParams.
+ *
+ * The paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetParams,
+ * ::cuStreamBatchMemOp,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeGetParams
+ */
+CUresult CUDAAPI cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for a batch mem op node in the given graphExec
+ *
+ * Sets the parameters of a batch mem op node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The following fields on operations may be modified on an executable graph:
+ *
+ *  op.waitValue.address
+ *  op.waitValue.value[64]
+ *  op.waitValue.flags bits corresponding to wait type (i.e. CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified)
+ *  op.writeValue.address
+ *  op.writeValue.value[64]
+ *
+ * Other fields, such as the context, count or type of operations, and other types of operations such as membars, 
+ * may not be modified.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * The paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Batch mem op node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuStreamBatchMemOp,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeGetParams,
+ * ::cuGraphBatchMemOpNodeSetParams,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an allocation node and adds it to a graph
+ *
+ * Creates a new allocation node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+ *
+ * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
+ * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ * owning graph.
+ *
+ * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ * graph which are ordered after the allocation node, but also by stream operations ordered after the
+ * graph's execution but before the allocation is freed.
+ *
+ * Allocations which are not freed in the same graph can be freed by:
+ * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree;
+ * - launching a graph with a free node for that allocation; or
+ * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes
+ * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation.
+ * 
+ * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ * in another graph, a free node can no longer be added to the owning graph.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a memory alloc node's parameters
+ *
+ * Returns the parameters of a memory alloc node \p hNode in \p params_out.
+ * The \p poolProps and \p accessDescs returned in \p params_out are owned by the
+ * node.  This memory remains valid until the node is destroyed.  The returned
+ * parameters must not be modified.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+
+/**
+ * \brief Creates a memory free node and adds it to a graph
+ *
+ * Creates a new memory free node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dptr            - Address of memory to free
+ *
+ * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free:
+ * - an allocation twice in the same graph.
+ * - an address that was not returned by an allocation node.
+ * - an invalid address.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
+
+/**
+ * \brief Returns a memory free node's parameters
+ *
+ * Returns the address of a memory free node \p hNode in \p dptr_out.
+ *
+ * \param hNode    - Node to get the parameters for
+ * \param dptr_out - Pointer to return the device address
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out);
+
+/**
+ * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS.
+ *
+ * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ * freed back to the operating system.
+ *
+ * \param device - The device for which cached memory should be freed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuDeviceGetGraphMemAttribute
+ */
+CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device);
+
+/**
+ * \brief Query asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - retrieved value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
+
+/**
+ * \brief Set asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the device on which to set the attribute
+ * \param attr - attribute to set
+ * \param value - pointer to value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
+
+/**
+ * \brief Clones a graph
+ *
+ * This function creates a copy of \p originalGraph and returns it in \p phGraphClone.
+ * All parameters are copied into the cloned graph. The original graph may be modified
+ * after this call without affecting the clone.
+ *
+ * Child graph nodes in the original graph are recursively copied into the clone.
+ *
+ * \param phGraphClone  - Returns newly created cloned graph
+ * \param originalGraph - Graph to clone
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphNodeFindInClone
+ */
+CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph);
+
+/**
+ * \brief Finds a cloned version of a node
+ *
+ * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode
+ * in the original graph.
+ *
+ * \p hClonedGraph must have been cloned from the original graph via ::cuGraphClone.
+ * \p hOriginalNode must have been in the original graph at the time of the call to
+ * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have
+ * been removed. The cloned node is then returned via \p phNode.
+ *
+ * \param phNode  - Returns handle to the cloned node
+ * \param hOriginalNode - Handle to the original node
+ * \param hClonedGraph - Cloned graph to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
+
+/**
+ * \brief Returns a node's type
+ *
+ * Returns the node type of \p hNode in \p type.
+ *
+ * \param hNode - Node to query
+ * \param type  - Pointer to return the node type
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphKernelNodeGetParams,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphHostNodeGetParams,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphMemcpyNodeGetParams,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphMemsetNodeGetParams,
+ * ::cuGraphMemsetNodeSetParams
+ */
+CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type);
+
+/**
+ * \brief Returns a graph's nodes
+ *
+ * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this
+ * function will return the number of nodes in \p numNodes. Otherwise,
+ * \p numNodes entries will be filled in. If \p numNodes is higher than the actual
+ * number of nodes, the remaining entries in \p nodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numNodes.
+ *
+ * \param hGraph   - Graph to query
+ * \param nodes    - Pointer to return the nodes
+ * \param numNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetType,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
+
+/**
+ * \brief Returns a graph's root nodes
+ *
+ * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this
+ * function will return the number of root nodes in \p numRootNodes. Otherwise,
+ * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual
+ * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numRootNodes.
+ *
+ * \param hGraph       - Graph to query
+ * \param rootNodes    - Pointer to return the root nodes
+ * \param numRootNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetType,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
+
+/**
+ * \brief Returns a graph's dependency edges
+ *
+ * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
+ * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
+ * node in \p from[i]. \p from and \p to may both be NULL, in which
+ * case this function only returns the number of edges in \p numEdges. Otherwise,
+ * \p numEdges entries will be filled in. If \p numEdges is higher than the actual
+ * number of edges, the remaining entries in \p from and \p to will be set to NULL, and
+ * the number of edges actually returned will be written to \p numEdges.
+ *
+ * \param hGraph   - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param numEdges - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
+
+/**
+ * \brief Returns a graph's dependency edges (12.3+)
+ *
+ * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
+ * indices in \p from, \p to and \p edgeData; that is, the node in \p to[i] has a
+ * dependency on the node in \p from[i] with data \p edgeData[i]. \p from and \p to may
+ * both be NULL, in which case this function only returns the number of edges in
+ * \p numEdges. Otherwise, \p numEdges entries will be filled in. If \p numEdges is higher
+ * than the actual number of edges, the remaining entries in \p from and \p to will be
+ * set to NULL, and the number of edges actually returned will be written to \p numEdges.
+ * \p edgeData may alone be NULL, in which case the edges must all have default (zeroed)
+ * edge data. Attempting a lossy query via NULL \p edgeData will result in
+ * ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL then \p from and \p to must be
+ * as well.
+ *
+ * \param hGraph   - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param edgeData - Optional location to return edge data
+ * \param numEdges - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_LOSSY_QUERY,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetEdges_v2(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, CUgraphEdgeData *edgeData, size_t *numEdges);
+
+/**
+ * \brief Returns a node's dependencies
+ *
+ * Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p numDependencies. Otherwise,
+ * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numDependencies.
+ *
+ * \param hNode           - Node to query
+ * \param dependencies    - Pointer to return the dependencies
+ * \param numDependencies - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependentNodes,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
+
+/**
+ * \brief Returns a node's dependencies (12.3+)
+ *
+ * Returns a list of \p hNode's dependencies. \p dependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p numDependencies. Otherwise,
+ * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numDependencies.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL, then
+ * \p dependencies must be as well.
+ *
+ * \param hNode           - Node to query
+ * \param dependencies    - Pointer to return the dependencies
+ * \param edgeData        - Optional array to return edge data for each dependency
+ * \param numDependencies - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_LOSSY_QUERY,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependentNodes,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependencies_v2(CUgraphNode hNode, CUgraphNode *dependencies, CUgraphEdgeData *edgeData, size_t *numDependencies);
+
+/**
+ * \brief Returns a node's dependent nodes
+ *
+ * Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p numDependentNodes.
+ * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p numDependentNodes.
+ *
+ * \param hNode             - Node to query
+ * \param dependentNodes    - Pointer to return the dependent nodes
+ * \param numDependentNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+
+/**
+ * \brief Returns a node's dependent nodes (12.3+)
+ *
+ * Returns a list of \p hNode's dependent nodes. \p dependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p numDependentNodes.
+ * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p numDependentNodes.
+ *
+ * Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ * this API will return ::CUDA_ERROR_LOSSY_QUERY.  If \p edgeData is non-NULL, then
+ * \p dependentNodes must be as well.
+ *
+ * \param hNode             - Node to query
+ * \param dependentNodes    - Pointer to return the dependent nodes
+ * \param edgeData          - Optional pointer to return edge data for dependent nodes
+ * \param numDependentNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_LOSSY_QUERY,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependentNodes_v2(CUgraphNode hNode, CUgraphNode *dependentNodes, CUgraphEdgeData *edgeData, size_t *numDependentNodes);
+
+/**
+ * \brief Adds dependency edges to a graph
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param hGraph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+
+/**
+ * \brief Adds dependency edges to a graph (12.3+)
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param hGraph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, default (zeroed) edge data is assumed.
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphAddDependencies_v2(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying a non-existing dependency will return an error.
+ *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param hGraph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+
+/**
+ * \brief Removes dependency edges from a graph (12.3+)
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an edge that does not exist in the graph, with data matching
+ * \p edgeData, results in an error. \p edgeData is nullable, which is equivalent
+ * to passing default (zeroed) data for each edge.
+ *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param hGraph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param edgeData - Optional array of edge data. If NULL, edge data is assumed to
+ *                   be default (zeroed).
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphRemoveDependencies_v2(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+
+/**
+ * \brief Remove a node from the graph
+ *
+ * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes
+ * on \p hNode and vice versa.
+ *
+ * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed.
+ * Any attempt to do so will return an error.
+ *
+ * \param hNode  - Node to remove
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p hGraph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p phGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p hGraph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to instantiate
+ * a second executable graph before destroying the first with ::cuGraphExecDestroy
+ * will result in an error.
+ * The same also applies if \p hGraph contains any device-updatable kernel nodes.
+ *
+ * If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * contexts, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single context.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current context, and the current context must
+ *     match the context of other nodes in the graph.
+ *
+ * \param phGraphExec - Returns instantiated graph
+ * \param hGraph      - Graph to instantiate
+ * \param flags       - Flags to control instantiation.  See ::CUgraphInstantiate_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphCreate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p hGraph as an executable graph according to the \p instantiateParams structure.
+ * The graph is validated for any structural constraints or intra-node constraints
+ * which were not previously validated. If instantiation is successful, a handle to
+ * the instantiated graph is returned in \p phGraphExec.
+ *
+ * \p instantiateParams controls the behavior of instantiation and subsequent
+ * graph launches, as well as returning more detailed information in the event of an error.
+ * ::CUDA_GRAPH_INSTANTIATE_PARAMS is defined as:
+ *
+ * \code
+    typedef struct {
+        cuuint64_t flags;
+        CUstream hUploadStream;
+        CUgraphNode hErrNode_out;
+        CUgraphInstantiateResult result_out;
+    } CUDA_GRAPH_INSTANTIATE_PARAMS;
+ * \endcode
+ *
+ * The \p flags field controls the behavior of instantiation and subsequent
+ * graph launches. Valid flags are:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD, which will perform an upload of the graph
+ * into \p hUploadStream once the graph has been instantiated.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch
+ * from the device. If this flag is passed, the executable graph handle returned can be
+ * used to launch the graph from both the host and device. This flag can only be used
+ * on platforms which support unified addressing. This flag cannot be used in
+ * conjunction with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p hGraph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time. An attempt to instantiate a
+ * second executable graph before destroying the first with ::cuGraphExecDestroy will
+ * result in an error.
+ * The same also applies if \p hGraph contains any device-updatable kernel nodes.
+ *
+ * If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple
+ * contexts, this will result in an error.
+ *
+ * Graphs instantiated for launch on the device have additional restrictions which do not
+ * apply to host graphs:
+ *
+ * - The graph's nodes must reside on a single context.
+ * - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes.
+ * - The graph cannot be empty and must contain at least one kernel, memcpy, or memset node.
+ *   Operation-specific restrictions are outlined below.
+ * - Kernel nodes:
+ *   - Use of CUDA Dynamic Parallelism is not permitted.
+ *   - Cooperative launches are permitted as long as MPS is not in use.
+ * - Memcpy nodes:
+ *   - Only copies involving device memory and/or pinned device-mapped host memory are permitted.
+ *   - Copies involving CUDA arrays are not permitted.
+ *   - Both operands must be accessible from the current context, and the current context must
+ *     match the context of other nodes in the graph.
+ *
+ * In the event of an error, the \p result_out and \p hErrNode_out fields will contain more
+ * information about the nature of the error. Possible error reporting includes:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_ERROR, if passed an invalid value or if an unexpected error occurred
+ *   which is described by the return value of the function. \p hErrNode_out will be set to NULL.
+ * - ::CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE, if the graph structure is invalid. \p hErrNode_out
+ *   will be set to one of the offending nodes.
+ * - ::CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED, if the graph is instantiated for device
+ *   launch but contains a node of an unsupported node type, or a node which performs unsupported
+ *   operations, such as use of CUDA dynamic parallelism within a kernel node. \p hErrNode_out will
+ *   be set to this node.
+ * - ::CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED, if the graph is instantiated for device
+ *   launch but a node's context differs from that of another node. This error can also be returned
+ *   if a graph is not instantiated for device launch and it contains kernels which call device-side
+ *   cudaGraphLaunch() from multiple contexts. \p hErrNode_out will be set to this node.
+ *
+ * If instantiation is successful, \p result_out will be set to ::CUDA_GRAPH_INSTANTIATE_SUCCESS,
+ * and \p hErrNode_out will be set to NULL.
+ *
+ * \param phGraphExec       - Returns instantiated graph
+ * \param hGraph            - Graph to instantiate
+ * \param instantiateParams - Instantiation parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphInstantiate,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphInstantiateWithParams(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+
+/**
+ * \brief Query the instantiation flags of an executable graph
+ *
+ * Returns the flags that were passed to instantiation for the given executable graph.
+ * ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD will not be returned by this API as it does
+ * not affect the resulting executable graph.
+ *
+ * \param hGraphExec - The executable graph to query
+ * \param flags      - Returns the instantiation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphInstantiateWithParams
+ */
+CUresult CUDAAPI cuGraphExecGetFlags(CUgraphExec hGraphExec, cuuint64_t *flags);
+
+/**
+ * \brief Sets the parameters for a kernel node in the given graphExec
+ *
+ * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
+ * The node is identified by the corresponding node \p hNode in the 
+ * non-executable graph, from which the executable graph was instantiated. 
+ *
+ * \p hNode must not have been removed from the original graph. All \p nodeParams 
+ * fields may change, but the following restrictions apply to \p func updates: 
+ *
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same context as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already 
+ * enqueued or running launches of \p hGraphExec are not affected by this call. 
+ * \p hNode is also not modified by this call.
+ *
+ * If \p hNode is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ * will overwrite any previous device-side updates. Additionally, applying host updates to a
+ * device-updatable kernel node while it is being updated from the device will result in
+ * undefined behavior.
+ * 
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param hNode       - kernel node from the graph from which graphExec was instantiated
+ * \param nodeParams  - Updated Parameters to set
+ * 
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p copyParams at instantiation.  \p hNode must remain in the graph which was
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p hNode are ignored.
+ *
+ * The source and destination memory in \p copyParams must be allocated from the same 
+ * contexts as the original source and destination memory.  Both the instantiation-time 
+ * memory operands and the memory operands in \p copyParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also
+ * not modified by this call.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or
+ * either the original or new memory operands are multidimensional.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Memcpy node from the graph which was used to instantiate graphExec
+ * \param copyParams - The updated parameters to set
+ * \param ctx        - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p memsetParams at instantiation.  \p hNode must remain in the graph which was
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p hNode are ignored.
+ *
+ * Zero sized operations are not supported.
+ *
+ * The new destination pointer in \p memsetParams must be to the same kind of allocation
+ * as the original destination pointer and have the same context association and device mapping
+ * as the original destination pointer.
+ *
+ * Both the value and pointer address may be updated.  
+ * Changing other aspects of the memset (width, height, element size or pitch) may cause the update to be rejected.
+ * Specifically, for 2d memsets, all dimension changes are rejected.
+ * For 1d memsets, changes in height are explicitly rejected and other changes are opportunistically allowed
+ * if the resulting work maps onto the work resources already allocated for the node.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also
+ * not modified by this call.
+ *
+ * \param hGraphExec   - The executable graph in which to set the specified node
+ * \param hNode        - Memset node from the graph which was used to instantiate graphExec
+ * \param memsetParams - The updated parameters to set
+ * \param ctx          - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a host node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p nodeParams at instantiation.  \p hNode must remain in the graph which was
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also
+ * not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Host node from the graph which was used to instantiate graphExec
+ * \param nodeParams - The updated parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained
+ * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ * \p hNode must remain in the graph which was used to instantiate \p hGraphExec.
+ * Changed edges to and from \p hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also 
+ * not modified by this call.
+ *
+ * The topology of \p childGraph, as well as the node insertion order,  must match that
+ * of the graph contained in \p hNode.  See ::cuGraphExecUpdate() for a list of restrictions
+ * on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ * nodes contained within the top level child graph will also be updated.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Child graph node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event record node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event wait node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the parameters for an external semaphore signal node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for an external semaphore wait node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExecNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Enables or disables the specified node in the given graphExec
+ *
+ * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent 
+ * to empty nodes until they are reenabled. Existing node parameters are not affected by 
+ * disabling/enabling the node.
+ *  
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * If \p hNode is a device-updatable kernel node, the next upload/launch of \p hGraphExec
+ * will overwrite any previous device-side updates. Additionally, applying host updates to a
+ * device-updatable kernel node while it is being updated from the device will result in
+ * undefined behavior.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Node is enabled if != 0, otherwise the node is disabled
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetEnabled,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
+
+/**
+ * \brief Query whether a node in the given graphExec is enabled
+ *
+ * Sets \p isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+ *
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ * \note This function will not reflect device-side updates for device-updatable kernel nodes.
+ *
+ * \param hGraphExec - The executable graph in which to query the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Location to return the enabled status of the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetEnabled,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
+ * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p hStream and any previous launches of \p hGraphExec.
+ * Uses memory cached by \p hStream to back the allocations owned by \p hGraphExec.
+ *
+ * \param hGraphExec - Executable graph to upload
+ * \param hStream    - Stream in which to upload the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream);
+
+/**
+ * \brief Launches an executable graph in a stream
+ *
+ * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing
+ * at a time. Each launch is ordered behind both any previous work in \p hStream
+ * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
+ * instantiated multiple times into multiple executable graphs.
+ *
+ * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
+ * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
+ * the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * \param hGraphExec - Executable graph to launch
+ * \param hStream    - Stream in which to launch the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
+
+/**
+ * \brief Destroys an executable graph
+ *
+ * Destroys the executable graph specified by \p hGraphExec, as well
+ * as all of its executable nodes. If the executable graph is
+ * in-flight, it will not be terminated, but rather freed
+ * asynchronously on completion.
+ *
+ * \param hGraphExec - Executable graph to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
+
+/**
+ * \brief Destroys a graph
+ *
+ * Destroys the graph specified by \p hGraph, as well as all of its nodes.
+ *
+ * \param hGraph - Graph to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
+
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A node whose function originally did not make device-side update calls cannot be updated
+ *     to a function which makes device-side update calls.
+ *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ *   - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the
+ *     priority attribute cannot change. Equality is checked on the originally requested
+ *     priority values, before they are clamped to the device's supported range.
+ *   - If \p hGraphExec was not instantiated for device launch, a node whose function originally
+ *     did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ *     device-side cudaGraphLaunch() unless the node resides on the same context as nodes which
+ *     contained such calls at instantiate-time. If no such calls were present at instantiation,
+ *     these updates cannot be performed at all.
+ *   - Neither \p hGraph nor \p hGraphExec may contain device-updatable kernel nodes.
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - For 2d memsets, only address and assigned value may be updated.
+ *   - For 1d memsets, updating dimensions is also allowed, but may fail if the resulting operation doesn't
+ *     map onto the work resources already allocated for the node. 
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type (e.g. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ * - External semaphore wait nodes and signal nodes:
+ *   - Changing the number of semaphores is not supported.
+ * - Conditional nodes:
+ *   - Changing node parameters is not supported.
+ *   - Changing parameters of nodes within the conditional body graph is subject to the rules above.
+ *   - Conditional handle flags and default values are updated as part of the graph update.
+ *
+ * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+ *
+ * cuGraphExecUpdate sets the result member of \p resultInfo to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED
+ * under the following conditions:
+ * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode
+ *   is set to NULL.
+ * - \p hGraph has more exit nodes than \p hGraphExec, in which case resultInfo->errorNode is set to one of
+ *   the exit nodes in hGraph. 
+ * - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with,
+ *   in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node
+ *   from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode
+ *   will be set to the mismatched dependency. The dependencies are paired based on edge order and a dependency
+ *   does not match when the nodes are already paired based on other edges examined in the graph.
+ *
+ * cuGraphExecUpdate sets the result member of \p resultInfo to: 
+ * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
+ *   resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
+ *   way (see note above), in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way
+ *   that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like
+ *   the node's type or configuration, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ *
+ * If the update fails for a reason not listed above, the result member of \p resultInfo will be set
+ * to CU_GRAPH_EXEC_UPDATE_ERROR. If the update succeeds, the result member will be set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
+ *
+ * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully.  It returns
+ * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included 
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param resultInfo the error info structure 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo *resultInfo);
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p src to destination node \p dst.
+ * Both nodes must have the same context.
+ *
+ * \param[out] dst Destination node
+ * \param[in] src Source node
+ * For list of attributes see ::CUkernelNodeAttrID
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
+
+/**
+ * \brief Queries node attribute.
+ * 
+ * Queries attribute \p attr from node \p hNode and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hNode
+ * \param[in] attr
+ * \param[out] value_out 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *  
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      CUkernelNodeAttrValue *value_out);
+ 
+/**
+ * \brief Sets node attribute.
+ * 
+ * Sets attribute \p attr on node \p hNode from corresponding attribute of
+ * \p value.
+ *
+ * \param[out] hNode
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      const CUkernelNodeAttrValue *value);
+
+/**
+ * \brief Write a DOT file describing graph structure
+ *
+ * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph.
+ * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ * \p flags can be specified to write more detailed information about each node type such as
+ * parameter values, kernel attributes, node and function handles.
+ *
+ * \param hGraph - The graph to create a DOT file from
+ * \param path   - The path to write the DOT file to
+ * \param flags  - Flags from CUgraphDebugDot_flags for specifying which additional node information to write
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ */
+CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags);
+
+/**
+ * \brief Create a user object
+ *
+ * Create a user object with the specified destructor callback and initial reference count. The
+ * initial references are owned by the caller.
+ *
+ * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ * are executed by a shared internal thread. Another thread may be signaled to perform such
+ * actions, if it does not block forward progress of tasks scheduled through CUDA.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param ptr             - The pointer to pass to the destroy function
+ * \param destroy         - Callback to free the user object when it is no longer in use
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy,
+                                    unsigned int initialRefcount, unsigned int flags);
+
+/**
+ * \brief Retain a reference to a user object
+ *
+ * Retains new references to a user object. The new references are owned by the caller.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to retain
+ * \param count  - The number of references to retain, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Release a reference to a user object
+ *
+ * Releases user object references owned by the caller. The object's destructor is invoked if
+ * the reference count reaches zero.
+ *
+ * It is undefined behavior to release references not owned by the caller, or to use a user
+ * object handle after all references are released.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to release
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Retain a reference to a user object from a graph
+ *
+ * Creates or moves user object references that will be owned by a CUDA graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph to associate the reference with
+ * \param object - The user object to retain a reference for
+ * \param count  - The number of references to add to the graph, typically 1. Must be
+ *                 nonzero and not larger than INT_MAX.
+ * \param flags  - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references
+ *                 from the calling thread, rather than create new references. Pass 0
+ *                 to create new references.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+
+/**
+ * \brief Release a user object reference from a graph
+ *
+ * Releases user object references owned by a graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph that will release the reference
+ * \param object - The user object to release a reference for
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count);
+
+/**
+ * \brief Adds a node of arbitrary type to a graph
+ *
+ * Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p dependencies. \p numDependencies may be 0.
+ * \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphExecNodeSetParams
+ */
+CUresult CUDAAPI cuGraphAddNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Adds a node of arbitrary type to a graph (12.3+)
+ *
+ * Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies
+ * dependencies specified via \p dependencies. \p numDependencies may be 0.
+ * \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have
+ * any duplicate entries.
+ *
+ * \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ * and type-specific parameters in the corresponding union member. All unused bytes - that
+ * is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ * It is recommended to use brace initialization or memset to ensure all bytes are
+ * initialized.
+ *
+ * Note that for some node types, \p nodeParams may contain "out parameters" which are
+ * modified during the call, such as \p nodeParams->alloc.dptr.
+ *
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param dependencyData  - Optional edge data for the dependencies. If NULL, the data is
+ *                          assumed to be default (zeroed) for all dependencies.
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Specification of the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphExecNodeSetParams
+ */
+CUresult CUDAAPI cuGraphAddNode_v2(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters
+ *
+ * Sets the parameters of graph node \p hNode to \p nodeParams. The node type specified by
+ * \p nodeParams->type must match the type of \p hNode. \p nodeParams must be fully
+ * initialized and all unused bytes (reserved, padding) zeroed.
+ *
+ * Modifying parameters is not supported for node types CU_GRAPH_NODE_TYPE_MEM_ALLOC and
+ * CU_GRAPH_NODE_TYPE_MEM_FREE.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphExecNodeSetParams
+ */
+CUresult CUDAAPI cuGraphNodeSetParams(CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Updates a graph node's parameters in an instantiated graph
+ *
+ * Sets the parameters of a node in an executable graph \p hGraphExec. The node is identified
+ * by the corresponding node \p hNode in the non-executable graph from which the executable
+ * graph was instantiated. \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Allowed changes to parameters on executable graphs are as follows:
+ * <table>
+ *   <tr><th>Node type<th>Allowed changes
+ *   <tr><td>kernel<td>See ::cuGraphExecKernelNodeSetParams
+ *   <tr><td>memcpy<td>Addresses for 1-dimensional copies if allocated in same context; see ::cuGraphExecMemcpyNodeSetParams
+ *   <tr><td>memset<td>Addresses for 1-dimensional memsets if allocated in same context; see ::cuGraphExecMemsetNodeSetParams
+ *   <tr><td>host<td>Unrestricted
+ *   <tr><td>child graph<td>Topology must match and restrictions apply recursively; see ::cuGraphExecUpdate
+ *   <tr><td>event wait<td>Unrestricted
+ *   <tr><td>event record<td>Unrestricted
+ *   <tr><td>external semaphore signal<td>Number of semaphore operations cannot change
+ *   <tr><td>external semaphore wait<td>Number of semaphore operations cannot change
+ *   <tr><td>memory allocation<td>API unsupported
+ *   <tr><td>memory free<td>API unsupported
+ *   <tr><td>batch memops<td>Addresses, values, and operation type for wait operations; see ::cuGraphExecBatchMemOpNodeSetParams
+ * </table>
+ *
+ * \param hGraphExec  - The executable graph in which to update the specified node
+ * \param hNode       - Corresponding node from the graph from which graphExec was instantiated
+ * \param nodeParams  - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode,
+ * ::cuGraphNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+
+/**
+ * \brief Create a conditional handle
+ *
+ * Creates a conditional handle associated with \p hGraph. 
+ *  
+ * The conditional handle must be associated with a conditional node in this graph or one of its children.
+ *  
+ * Handles not associated with a conditional node may cause graph instantiation to fail. 
+ *  
+ * Handles can only be set from the context with which they are associated. 
+ *
+ * \param pHandle_out        - Pointer used to return the handle to the caller.
+ * \param hGraph             - Graph which will contain the conditional node using this handle.
+ * \param ctx                - Context for the handle and associated conditional node.
+ * \param defaultLaunchValue - Optional initial value for the conditional variable.
+ *                             Applied at the beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT is set in \p flags.
+ * \param flags              - Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddNode
+ */
+CUresult CUDAAPI cuGraphConditionalHandleCreate(CUgraphConditionalHandle *pHandle_out, CUgraph hGraph, CUcontext ctx, unsigned int defaultLaunchValue, unsigned int flags);
+
+/** @} */ /* END CUDA_GRAPH */
+
+/**
+ * \defgroup CUDA_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the number of the maximum active blocks per
+ * streaming multiprocessor.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
+
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the number of the maximum active blocks per
+ * streaming multiprocessor.
+ *
+ * The \p flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, if caching is enabled, but
+ *   per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching
+ *   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
+ *   the occupancy calculator return 0 in such cases. More information
+ *   can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * Returns in \p *blockSize a reasonable block size that can achieve
+ * the maximum occupancy (or, the maximum number of active warps with
+ * the fewest blocks per multiprocessor), and in \p *minGridSize the
+ * minimum grid size to achieve the maximum occupancy.
+ *
+ * If \p blockSizeLimit is 0, the configurator will use the maximum
+ * block size permitted by the device / function instead.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the
+ * user should leave both \p blockSizeToDynamicSMemSize and \p
+ * dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if
+ * the dynamic shared memory size is constant regardless of block
+ * size, the size should be passed through \p dynamicSMemSize, and \p
+ * blockSizeToDynamicSMemSize should be NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with
+ * different block sizes, the user needs to provide a unary function
+ * through \p blockSizeToDynamicSMemSize that computes the dynamic
+ * shared memory needed by \p func for any given block size. \p
+ * dynamicSMemSize is ignored. An example signature is:
+ *
+ * \code
+ *    // Take block size, returns dynamic shared memory needed
+ *    size_t blockToSmem(int blockSize);
+ * \endcode
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ * \param func        - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSize
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
+ * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
+ * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p flags
+ * parameter.
+ *
+ * The \p flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxPotentialBlockSize;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, the launch configuration that
+ *   produces maximal occupancy might not support global
+ *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
+ *   guarantees that the produced launch configuration is global
+ *   caching compatible at a potential cost of occupancy. More information
+ *   can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ * \param func        - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ * \param flags       - Options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on an SM
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will be the current context.
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on an SM
+ * \param blockSize       - Size of the blocks
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ */
+CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p config are ignored. If \p func has a required
+ * cluster size set (see ::cudaFuncGetAttributes / ::cuFuncGetAttribute),
+ * \p *clusterSize will reflect the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile-time launch bounds.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will either be taken from the specified stream \p config->hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param clusterSize - Returned maximum cluster size that can be launched
+ *                      for the given kernel function and launch configuration
+ * \param func        - Kernel function for which maximum cluster
+ *                      size is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has required cluster size already set (see
+ * ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size
+ * from config must either be unspecified or match the required size.
+ * Without required sizes, the cluster size must be specified in config,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. Runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * Note that the API can also be used with context-less kernel ::CUkernel
+ * by querying the handle using ::cuLibraryGetKernel() and then passing it
+ * to the API by casting to ::CUfunction. Here, the context to use for calculations
+ * will either be taken from the specified stream \p config->hStream
+ * or the current context in case of NULL stream.
+ *
+ * \param numClusters - Returned maximum number of clusters that
+ *                      could co-exist on the target device
+ * \param func        - Kernel function for which maximum number
+ *                      of clusters are calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CLUSTER_SIZE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveClusters(int *numClusters, CUfunction func, const CUlaunchConfig *config);
+/** @} */ /* END CUDA_OCCUPANCY */
+
+/**
+ * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated texture reference management functions of the
+ * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated texture reference management
+ * functions of the low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Binds an array as a texture reference
+ *
+ * \deprecated
+ *
+ * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. \p Flags must be set to
+ * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
+ * unbound.
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param hArray  - Array to bind
+ * \param Flags   - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Binds a mipmapped array to a texture reference
+ *
+ * \deprecated
+ *
+ * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef.
+ * Any previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT.
+ * Any CUDA array previously bound to \p hTexRef is unbound.
+ *
+ * \param hTexRef         - Texture reference to bind
+ * \param hMipmappedArray - Mipmapped array to bind
+ * \param Flags           - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+
+/**
+ * \brief Binds an address as a texture reference
+ *
+ * \deprecated
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses, ::cuTexRefSetAddress() passes back a byte offset in
+ * \p *ByteOffset that must be applied to texture fetches in order to read from
+ * the desired memory. This offset must be divided by the texel size and
+ * passed to kernels that read from the texture so they can be applied to the
+ * ::tex1Dfetch() function.
+ *
+ * If the device memory pointer was returned from ::cuMemAlloc(), the offset
+ * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
+ *
+ * The total number of elements (or texels) in the linear address range
+ * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
+ * The number of elements is computed as (\p bytes / bytesPerElement),
+ * where bytesPerElement is determined from the data format and number of
+ * components set using ::cuTexRefSetFormat().
+ *
+ * \param ByteOffset - Returned byte offset
+ * \param hTexRef    - Texture reference to bind
+ * \param dptr       - Device pointer to bind
+ * \param bytes      - Size of memory to bind in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+
+/**
+ * \brief Binds an address as a 2D texture reference
+ *
+ * \deprecated
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Using a ::tex2D() function inside a kernel requires a call to either
+ * ::cuTexRefSetArray() to bind the corresponding texture reference to an
+ * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
+ * memory.
+ *
+ * Function calls to ::cuTexRefSetFormat() cannot follow calls to
+ * ::cuTexRefSetAddress2D() for the same texture reference.
+ *
+ * It is required that \p dptr be aligned to the appropriate hardware-specific
+ * texture alignment. You can query this value using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is
+ * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \p Pitch has to be aligned to the hardware-specific texture pitch alignment.
+ * This value can be queried using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is
+ * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Width and Height, which are specified in elements (or texels), cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ * \p Pitch, which is specified in bytes, cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param desc    - Descriptor of CUDA array
+ * \param dptr    - Device pointer to bind
+ * \param Pitch   - Line pitch in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+
+/**
+ * \brief Sets the format for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the format of the data to be read by the texture reference
+ * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
+ * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
+ * They specify the format of each component and the number of components per
+ * array element.
+ *
+ * \param hTexRef             - Texture reference
+ * \param fmt                 - Format to set
+ * \param NumPackedComponents - Number of components per array element
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaCreateChannelDesc
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+
+/**
+ * \brief Sets the addressing mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the addressing mode \p am for the given dimension \p dim of the
+ * texture reference \p hTexRef. If \p dim is zero, the addressing mode is
+ * applied to the first parameter of the functions used to fetch from the
+ * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined
+ * as:
+ * \code
+   typedef enum CUaddress_mode_enum {
+      CU_TR_ADDRESS_MODE_WRAP = 0,
+      CU_TR_ADDRESS_MODE_CLAMP = 1,
+      CU_TR_ADDRESS_MODE_MIRROR = 2,
+      CU_TR_ADDRESS_MODE_BORDER = 3
+   } CUaddress_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only
+ * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+ *
+ * \param hTexRef - Texture reference
+ * \param dim     - Dimension
+ * \param am      - Addressing mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+
+/**
+ * \brief Sets the filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the filtering mode \p fm to be used when reading memory through
+ * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+ *
+ * \code
+   typedef enum CUfilter_mode_enum {
+      CU_TR_FILTER_MODE_POINT = 0,
+      CU_TR_FILTER_MODE_LINEAR = 1
+   } CUfilter_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ *
+ * \param hTexRef - Texture reference
+ * \param fm      - Filtering mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+
+/**
+ * \brief Sets the mipmap filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the mipmap filtering mode \p fm to be used when reading memory through
+ * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+ *
+ * \code
+   typedef enum CUfilter_mode_enum {
+      CU_TR_FILTER_MODE_POINT = 0,
+      CU_TR_FILTER_MODE_LINEAR = 1
+   } CUfilter_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef - Texture reference
+ * \param fm      - Filtering mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+
+/**
+ * \brief Sets the mipmap level bias for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when
+ * reading memory through the texture reference \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef - Texture reference
+ * \param bias    - Mipmap level bias
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
+
+/**
+ * \brief Sets the min/max mipmap level clamps for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp
+ * respectively, to be used when reading memory through the texture reference
+ * \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef             - Texture reference
+ * \param minMipmapLevelClamp - Mipmap min level clamp
+ * \param maxMipmapLevelClamp - Mipmap max level clamp
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+
+/**
+ * \brief Sets the maximum anisotropy for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ *
+ * \param hTexRef  - Texture reference
+ * \param maxAniso - Maximum anisotropy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+
+/**
+ * \brief Sets the border color for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference
+ * \p hTexRef. The color value supports only float type and holds color components in
+ * the following sequence:
+ * pBorderColor[0] holds 'R' component
+ * pBorderColor[1] holds 'G' component
+ * pBorderColor[2] holds 'B' component
+ * pBorderColor[3] holds 'A' component
+ *
+ * Note that the color values can be set only when the Address mode is set to
+ * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode.
+ * Applications using integer border color values have to "reinterpret_cast" their values to float.
+ *
+ * \param hTexRef       - Texture reference
+ * \param pBorderColor  - RGBA color
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
+
+/**
+ * \brief Sets the flags for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies optional flags via \p Flags to specify the behavior of data
+ * returned through the texture reference \p hTexRef. The valid flags are:
+ *
+ * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ *   having the texture promote integer data to floating point data in the
+ *   range [0, 1]. Note that texture with 32-bit integer format
+ *   would not be promoted, regardless of whether or not this
+ *   flag is specified;
+ * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
+ *   default behavior of having the texture coordinates range
+ *   from [0, Dim) where Dim is the width or height of the CUDA
+ *   array. Instead, the texture coordinates [0, 1.0) reference
+ *   the entire breadth of the array dimension;
+ * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
+ *
+ * \param hTexRef - Texture reference
+ * \param Flags   - Optional flags to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+
+/**
+ * \brief Gets the address associated with a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pdptr the base address bound to the texture reference
+ * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any device memory range.
+ *
+ * \param pdptr   - Returned device address
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
+
+/**
+ * \brief Gets the array bound to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *phArray the CUDA array bound to the texture reference
+ * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray - Returned array
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmapped array bound to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
+ * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any CUDA mipmapped array.
+ *
+ * \param phMipmappedArray - Returned mipmapped array
+ * \param hTexRef          - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the addressing mode used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pam the addressing mode corresponding to the
+ * dimension \p dim of the texture reference \p hTexRef. Currently, the only
+ * valid values for \p dim are 0 and 1.
+ *
+ * \param pam     - Returned addressing mode
+ * \param hTexRef - Texture reference
+ * \param dim     - Dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+
+/**
+ * \brief Gets the filter-mode used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pfm the filtering mode of the texture reference
+ * \p hTexRef.
+ *
+ * \param pfm     - Returned filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the format used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pFormat and \p *pNumChannels the format and number
+ * of components of the CUDA array bound to the texture reference \p hTexRef.
+ * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
+ *
+ * \param pFormat      - Returned format
+ * \param pNumChannels - Returned number of components
+ * \param hTexRef      - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the mipmap filtering mode in \p pfm that's used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pfm     - Returned mipmap filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap level bias for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the mipmap level bias in \p pbias that's added to the specified mipmap
+ * level when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pbias   - Returned mipmap level bias
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
+
+/**
+ * \brief Gets the min/max mipmap level clamps for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp
+ * that are used when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pminMipmapLevelClamp - Returned mipmap min level clamp
+ * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp
+ * \param hTexRef              - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+
+/**
+ * \brief Gets the maximum anisotropy for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pmaxAniso - Returned maximum anisotropy
+ * \param hTexRef   - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
+
+/**
+ * \brief Gets the border color used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p pBorderColor, values of the RGBA color used by
+ * the texture reference \p hTexRef.
+ * The color value is of type float and holds color components in
+ * the following sequence:
+ * pBorderColor[0] holds 'R' component
+ * pBorderColor[1] holds 'G' component
+ * pBorderColor[2] holds 'B' component
+ * pBorderColor[3] holds 'A' component
+ *
+ * \param pBorderColor - Returned Type and Value of RGBA color
+ * \param hTexRef      - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetBorderColor
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
+
+/**
+ * \brief Gets the flags used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
+ *
+ * \param pFlags  - Returned flags
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
+
+/**
+ * \brief Creates a texture reference
+ *
+ * \deprecated
+ *
+ * Creates a texture reference and returns its handle in \p *pTexRef. Once
+ * created, the application must call ::cuTexRefSetArray() or
+ * ::cuTexRefSetAddress() to associate the reference with allocated memory.
+ * Other texture reference functions are used to specify the format and
+ * interpretation (addressing, filtering, etc.) to be used when the memory is
+ * read through this texture reference.
+ *
+ * \param pTexRef - Returned texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefDestroy
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef);
+
+/**
+ * \brief Destroys a texture reference
+ *
+ * \deprecated
+ *
+ * Destroys the texture reference specified by \p hTexRef.
+ *
+ * \param hTexRef - Texture reference to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefCreate
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
+
+/** @} */ /* END CUDA_TEXREF_DEPRECATED */
+
+
+/**
+ * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ surface reference management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface reference management functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the CUDA array for a surface reference.
+ *
+ * \deprecated
+ *
+ * Sets the CUDA array \p hArray to be read and written by the surface reference
+ * \p hSurfRef.  Any previous CUDA array state associated with the surface
+ * reference is superseded by this function.  \p Flags must be set to 0.
+ * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
+ * Any CUDA array previously bound to \p hSurfRef is unbound.
+ *
+ * \param hSurfRef - Surface reference handle
+ * \param hArray - CUDA array handle
+ * \param Flags - set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuModuleGetSurfRef,
+ * ::cuSurfRefGetArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Passes back the CUDA array bound to a surface reference.
+ *
+ * \deprecated
+ *
+ * Returns in \p *phArray the CUDA array bound to the surface reference
+ * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray - Returned CUDA array handle
+ * \param hSurfRef - Surface reference handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+
+/** @} */ /* END CUDA_SURFREF_DEPRECATED */
+
+/**
+ * \defgroup CUDA_TEXOBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the texture object management functions of the
+ * low-level CUDA driver application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array not in a block
+ * compressed format.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::CUDA_RESOURCE_DESC structure is defined as:
+ * \code
+        typedef struct CUDA_RESOURCE_DESC_st
+        {
+            CUresourcetype resType;
+
+            union {
+                struct {
+                    CUarray hArray;
+                } array;
+                struct {
+                    CUmipmappedArray hMipmappedArray;
+                } mipmap;
+                struct {
+                    CUdeviceptr devPtr;
+                    CUarray_format format;
+                    unsigned int numChannels;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    CUdeviceptr devPtr;
+                    CUarray_format format;
+                    unsigned int numChannels;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+
+            unsigned int flags;
+        } CUDA_RESOURCE_DESC;
+
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from.
+ * ::CUresourcetype is defined as:
+ * \code
+        typedef enum CUresourcetype_enum {
+            CU_RESOURCE_TYPE_ARRAY           = 0x00,
+            CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
+            CU_RESOURCE_TYPE_LINEAR          = 0x02,
+            CU_RESOURCE_TYPE_PITCH2D         = 0x03
+        } CUresourcetype;
+ * \endcode
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray
+ * must be set to a valid CUDA mipmapped array handle.
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr
+ * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels
+ * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)).
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr
+ * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels
+ * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width
+ * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+ *
+ * - ::CUDA_RESOURCE_DESC::flags must be set to zero.
+ *
+ *
+ * The ::CUDA_TEXTURE_DESC struct is defined as
+ * \code
+        typedef struct CUDA_TEXTURE_DESC_st {
+            CUaddress_mode addressMode[3];
+            CUfilter_mode filterMode;
+            unsigned int flags;
+            unsigned int maxAnisotropy;
+            CUfilter_mode mipmapFilterMode;
+            float mipmapLevelBias;
+            float minMipmapLevelClamp;
+            float maxMipmapLevelClamp;
+        } CUDA_TEXTURE_DESC;
+ * \endcode
+ * where
+ * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as:
+ *   \code
+        typedef enum CUaddress_mode_enum {
+            CU_TR_ADDRESS_MODE_WRAP = 0,
+            CU_TR_ADDRESS_MODE_CLAMP = 1,
+            CU_TR_ADDRESS_MODE_MIRROR = 2,
+            CU_TR_ADDRESS_MODE_BORDER = 3
+        } CUaddress_mode;
+ *   \endcode
+ *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES
+ *   is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+ *
+ * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as:
+ *   \code
+        typedef enum CUfilter_mode_enum {
+            CU_TR_FILTER_MODE_POINT = 0,
+            CU_TR_FILTER_MODE_LINEAR = 1
+        } CUfilter_mode;
+ *   \endcode
+ *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR.
+ *
+ * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following:
+ *   - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ *   having the texture promote integer data to floating point data in the
+ *   range [0, 1]. Note that texture with 32-bit integer format would not be 
+ *   promoted, regardless of whether or not this flag is specified.
+ *   - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior
+ *   of having the texture coordinates range from [0, Dim) where Dim is the 
+ *   width or height of the CUDA array. Instead, the texture coordinates 
+ *   [0, 1.0) reference the entire breadth of the array dimension; Note that
+ *   for CUDA mipmapped arrays, this flag has to be set.
+ *   - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
+ *   - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. 
+ *   This flag can only be specified if the underlying resource is a CUDA array 
+ *   or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP.
+ *   When seamless cube map filtering is enabled, texture address modes specified 
+ *   by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode 
+ *   is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP 
+ *   will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is 
+ *   set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed
+ *   when sampling along the cube face borders.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ *
+ * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as
+ * \code
+        typedef struct CUDA_RESOURCE_VIEW_DESC_st
+        {
+            CUresourceViewFormat format;
+            size_t width;
+            size_t height;
+            size_t depth;
+            unsigned int firstMipmapLevel;
+            unsigned int lastMipmapLevel;
+            unsigned int firstLayer;
+            unsigned int lastLayer;
+        } CUDA_RESOURCE_VIEW_DESC;
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32
+ *   with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have
+ *   a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base
+ *   format but with 4 channels.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp
+ *   will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources,
+ *   this value has to be zero.
+ *
+ *
+ * \param pTexObject   - Texture object to create
+ * \param pResDesc     - Resource descriptor
+ * \param pTexDesc     - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectDestroy,
+ * ::cudaCreateTextureObject
+ */
+CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc);
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaDestroyTextureObject
+ */
+CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
+ *
+ * \param pResDesc  - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectResourceDesc
+ */
+CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc  - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectTextureDesc
+ */
+CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was set for \p texObject, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject    - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectResourceViewDesc
+ */
+CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject);
+
+/** @} */ /* END CUDA_TEXOBJECT */
+
+/**
+ * \defgroup CUDA_SURFOBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface object management functions of the
+ * low-level CUDA driver application programming interface. The surface
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be
+ * ::CU_RESOURCE_TYPE_ARRAY and  ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc    - Resource descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectDestroy,
+ * ::cudaCreateSurfaceObject
+ */
+CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc);
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectCreate,
+ * ::cudaDestroySurfaceObject
+ */
+CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc   - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectCreate,
+ * ::cudaGetSurfaceObjectResourceDesc
+ */
+CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject);
+
+/** @} */ /* END CUDA_SURFOBJECT */
+
+/**
+ * \defgroup CUDA_TENSOR_MEMORY Tensor Map Object Management
+ *
+ * ___MANBRIEF___ tensor map object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the tensor map object management functions of the
+ * low-level CUDA driver application programming interface. The tensor
+ * map API is only supported on devices of compute capability 9.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a tensor map descriptor object representing tiled memory region
+ *
+ * Creates a descriptor for Tensor Memory Access (TMA) object specified
+ * by the parameters describing a tiled region and returns it in \p tensorMap.
+ *
+ * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA APIs and PTX.
+ *
+ * The parameters passed are bound to the following requirements:
+ *
+ * - \p tensorMap address must be aligned to 64 bytes.
+ *
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ * \code
+    typedef enum CUtensorMapDataType_enum {
+        CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+        CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,    // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,    // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,   // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B    // 6 bits
+    } CUtensorMapDataType;
+ * \endcode
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+ *
+ * - \p tensorRank must be non-zero and less than or equal to the maximum supported dimensionality of 5. If \p interleave is not
+ * ::CU_TENSOR_MAP_INTERLEAVE_NONE, then \p tensorRank must additionally be greater than or equal to 3.
+ *
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
+ *
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *    - Dimension for the packed data types must reflect the number of individual U# values.
+ *
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
+ * Each following dimension specified includes previous dimension stride:
+ * \code
+    globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+    for (i = 1; i < tensorRank - 1; i++)
+        globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+        assert(globalStrides[i] >= globalDim[i]);
+ * \endcode
+ *
+ * - \p boxDim array, which specifies number of elements to be traversed along each of the \p tensorRank dimensions, must be non-zero
+ * and less than or equal to 256. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, { \p boxDim[0] * elementSizeInBytes( \p tensorDataType ) } must be a multiple of 16 bytes.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, boxDim[0] must be 128.
+ *
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ * TMA doesn't support the stride for dimension zero.
+ * When all elements of \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if the \p elementStrides[i]
+ * is not equal to one, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension. To load N elements along
+ * i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ *
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ * \code
+    typedef enum CUtensorMapInterleave_enum {
+        CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+        CU_TENSOR_MAP_INTERLEAVE_16B,
+        CU_TENSOR_MAP_INTERLEAVE_32B
+    } CUtensorMapInterleave;
+ * \endcode
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ * uses 32 bytes.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
+ * (computed as \p boxDim[0] multiplied by element size derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ *    - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
+ *    - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ *    - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
+ *
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
+ * \code
+    typedef enum CUtensorMapSwizzle_enum {
+        CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+        CU_TENSOR_MAP_SWIZZLE_32B,                   // Swizzle 16B chunks within 32B  span
+        CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B  span
+        CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B          // Swizzle 64B chunks within 128B span
+    } CUtensorMapSwizzle;
+ * \endcode
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
+ * Other interleave modes can have any swizzling pattern.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+ *
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ * type ::CUtensorMapL2promotion, which is defined as:
+ * \code
+    typedef enum CUtensorMapL2promotion_enum {
+        CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+    } CUtensorMapL2promotion;
+ * \endcode
+ *
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ * ::CUtensorMapFloatOOBfill which is defined as:
+ * \code
+    typedef enum CUtensorMapFloatOOBfill_enum {
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+    } CUtensorMapFloatOOBfill;
+ * \endcode
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+ *
+ * \param tensorMap         - Tensor map object to create
+ * \param tensorDataType    - Tensor data type
+ * \param tensorRank        - Dimensionality of tensor
+ * \param globalAddress     - Starting address of memory region described by tensor
+ * \param globalDim         - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ * \param globalStrides     - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ * \param boxDim            - Array containing traversal box size (number of elements) along each of the \p tensorRank dimensions. Specifies how many elements to be traversed along each tensor dimension.
+ * \param elementStrides    - Array containing traversal stride in each of the \p tensorRank dimensions
+ * \param interleave        - Type of interleaved layout the tensor addresses
+ * \param swizzle           - Bank swizzling pattern inside shared memory
+ * \param l2Promotion       - L2 promotion size
+ * \param oobFill           - Indicate whether zero or special NaN constant must be used to fill out-of-bound elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapEncodeIm2colWide,
+ * ::cuTensorMapReplaceAddress
+ */
+CUresult CUDAAPI cuTensorMapEncodeTiled(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+
+/**
+ * \brief Create a tensor map descriptor object representing im2col memory region
+ *
+ * Creates a descriptor for Tensor Memory Access (TMA) object specified
+ * by the parameters describing a im2col memory layout and returns it in \p tensorMap.
+ *
+ * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA APIs and PTX.
+ *
+ * The parameters passed are bound to the following requirements:
+ *
+ * - \p tensorMap address must be aligned to 64 bytes.
+ *
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ * \code
+    typedef enum CUtensorMapDataType_enum {
+        CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+        CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,    // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,    // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,   // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B    // 6 bits
+    } CUtensorMapDataType;
+ * \endcode
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+ *
+ * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
+ *
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
+ *
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *    - Dimension for the packed data types must reflect the number of individual U# values.
+ *
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
+ * Each following dimension specified includes previous dimension stride:
+ * \code
+    globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+    for (i = 1; i < tensorRank - 1; i++)
+        globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+        assert(globalStrides[i] >= globalDim[i]);
+ * \endcode
+ *
+ * - \p pixelBoxLowerCorner array specifies the coordinate offsets {D, H, W} of the bounding box from top/left/front corner. The number of
+ * offsets and their precision depend on the tensor dimensionality:
+ *    - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported.
+ *    - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported.
+ *    - When \p tensorRank is 5, three offsets each within range [-16, 15] are supported.
+ *
+ * - \p pixelBoxUpperCorner array specifies the coordinate offsets {D, H, W} of the bounding box from bottom/right/back corner. The number of
+ * offsets and their precision depend on the tensor dimensionality:
+ *    - When \p tensorRank is 3, one signed offset within range [-32768, 32767] is supported.
+ *    - When \p tensorRank is 4, two signed offsets each within range [-128, 127] are supported.
+ *    - When \p tensorRank is 5, three offsets each within range [-16, 15] are supported.
+ * The bounding box specified by \p pixelBoxLowerCorner and \p pixelBoxUpperCorner must have non-zero area.
+ *
+ * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
+ *
+ * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the {N, D, H, W} dimensions, must be less than or
+ * equal to 1024.
+ *
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ * TMA doesn’t support the stride for dimension zero.
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
+ * To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ *
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ * \code
+    typedef enum CUtensorMapInterleave_enum {
+        CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+        CU_TENSOR_MAP_INTERLEAVE_16B,
+        CU_TENSOR_MAP_INTERLEAVE_32B
+    } CUtensorMapInterleave;
+ * \endcode
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ * uses 32 bytes.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE and \p swizzle is not ::CU_TENSOR_MAP_SWIZZLE_NONE, the bounding box inner dimension
+ * (computed as \p channelsPerPixel multiplied by element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ *    - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension to be <= 32.
+ *    - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ *    - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
+ *
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, has to be of type ::CUtensorMapSwizzle which is defined as:
+ * \code
+    typedef enum CUtensorMapSwizzle_enum {
+        CU_TENSOR_MAP_SWIZZLE_NONE = 0,
+        CU_TENSOR_MAP_SWIZZLE_32B,                   // Swizzle 16B chunks within 32B  span
+        CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B  span
+        CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B_FLIP_8B, // Swizzle 32B chunks within 128B span, additionally swap lower 8B with upper 8B within each 16B for every alternate row
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B          // Swizzle 64B chunks within 128B span
+    } CUtensorMapSwizzle;
+ * \endcode
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p swizzle must be ::CU_TENSOR_MAP_SWIZZLE_32B.
+ * Other interleave modes can have any swizzling pattern.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+ *
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ * type ::CUtensorMapL2promotion, which is defined as:
+ * \code
+    typedef enum CUtensorMapL2promotion_enum {
+        CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+    } CUtensorMapL2promotion;
+ * \endcode
+ *
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ * ::CUtensorMapFloatOOBfill which is defined as:
+ * \code
+    typedef enum CUtensorMapFloatOOBfill_enum {
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+    } CUtensorMapFloatOOBfill;
+ * \endcode
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+ *
+ * \param tensorMap             - Tensor map object to create
+ * \param tensorDataType        - Tensor data type
+ * \param tensorRank            - Dimensionality of tensor; must be at least 3
+ * \param globalAddress         - Starting address of memory region described by tensor
+ * \param globalDim             - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ * \param globalStrides         - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ * \param pixelBoxLowerCorner   - Array containing DHW dimensions of lower box corner
+ * \param pixelBoxUpperCorner   - Array containing DHW dimensions of upper box corner
+ * \param channelsPerPixel      - Number of channels per pixel
+ * \param pixelsPerColumn       - Number of pixels per column
+ * \param elementStrides        - Array containing traversal stride in each of the \p tensorRank dimensions
+ * \param interleave            - Type of interleaved layout the tensor addresses
+ * \param swizzle               - Bank swizzling pattern inside shared memory
+ * \param l2Promotion           - L2 promotion size
+ * \param oobFill               - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2colWide,
+ * ::cuTensorMapReplaceAddress
+ */
+CUresult CUDAAPI cuTensorMapEncodeIm2col(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+
+/**
+ * \brief Create a tensor map descriptor object representing im2col memory region, but where
+ * the elements are exclusively loaded along the W dimension.
+ *
+ * Creates a descriptor for Tensor Memory Access (TMA) object specified by the parameters
+ * describing a im2col memory layout and where the row is always loaded along the W dimension
+ * and returns it in \p tensorMap. This assumes the tensor layout in memory is either NDHWC,
+ * NHWC, or NWC.
+ *
+ * This API is only supported on devices of compute capability 10.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA APIs and PTX.
+ *
+ * The parameters passed are bound to the following requirements:
+ *
+ * - \p tensorMap address must be aligned to 64 bytes.
+ *
+ * - \p tensorDataType has to be an enum from ::CUtensorMapDataType which is defined as:
+ * \code
+    typedef enum CUtensorMapDataType_enum {
+        CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,       // 1 byte
+        CU_TENSOR_MAP_DATA_TYPE_UINT16,          // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT32,          // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT32,           // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_UINT64,          // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_INT64,           // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT16,         // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32,         // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT64,         // 8 bytes
+        CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,        // 2 bytes
+        CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,     // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,        // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ,    // 4 bytes
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B,    // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B,   // 4 bits
+        CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B    // 6 bits
+    } CUtensorMapDataType;
+ * \endcode
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B copies '16 x U4' packed values to memory aligned as 8 bytes. There are no gaps between packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B copies '16 x U4' packed values to memory aligned as 16 bytes. There are 8 byte gaps between every 8 byte chunk of packed values.
+ *  ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B copies '16 x U6' packed values to memory aligned as 16 bytes. There are 4 byte gaps between every 12 byte chunk of packed values.
+ *
+ * - \p tensorRank, which specifies the number of tensor dimensions, must be 3, 4, or 5.
+ *
+ * - \p globalAddress, which specifies the starting address of the memory region described, must be 16 byte aligned. The following requirements need to also be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, \p globalAddress must be 32 byte aligned.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p globalAddress must be 32 byte aligned.
+ *
+ * - \p globalDim array, which specifies tensor size of each of the \p tensorRank dimensions, must be non-zero and less than or
+ * equal to 2^32. Additionally, the following requirements need to be met for the packed data types:
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, globalDim[0] must be a multiple of 128.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, \p globalDim[0] must be a multiple of 2.
+ *    - Dimension for the packed data types must reflect the number of individual U# values.
+ *
+ * - \p globalStrides array, which specifies tensor stride of each of the lower \p tensorRank - 1 dimensions in bytes, must be a
+ * multiple of 16 and less than 2^40. Additionally, the following requirements need to be met:
+ *    - When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_32B, the strides must be a multiple of 32.
+ *    - When \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, the strides must be a multiple of 32.
+ * Each following dimension specified includes previous dimension stride:
+ * \code
+    globalStrides[0] = globalDim[0] * elementSizeInBytes(tensorDataType) + padding[0];
+    for (i = 1; i < tensorRank - 1; i++)
+        globalStrides[i] = globalStrides[i - 1] * (globalDim[i] + padding[i]);
+        assert(globalStrides[i] >= globalDim[i]);
+ * \endcode
+ *
+ * - \p pixelBoxLowerCornerWidth specifies the coordinate offset W of the bounding box from left corner. The offset must be
+ * within range [-32768, 32767].
+ *
+ * - \p pixelBoxUpperCornerWidth specifies the coordinate offset W of the bounding box from right corner. The offset must be
+ * within range [-32768, 32767].
+ *
+ * The bounding box specified by \p pixelBoxLowerCornerWidth and \p pixelBoxUpperCornerWidth must have non-zero area. Note
+ * that the size of the box along D and H dimensions is always equal to one.
+ *
+ * - \p channelsPerPixel, which specifies the number of elements which must be accessed along C dimension, must be less than or equal to 256.
+ * Additionally, when \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B or ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, \p channelsPerPixel must be 128.
+ *
+ * - \p pixelsPerColumn, which specifies the number of elements that must be accessed along the W dimension, must be less than or
+ * equal to 1024. This field is ignored when \p mode is ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128.
+ *
+ * - \p elementStrides array, which specifies the iteration step along each of the \p tensorRank dimensions, must be non-zero and less
+ * than or equal to 8. Note that when \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the first element of this array is ignored since
+ * TMA doesn’t support the stride for dimension zero.
+ * When all elements of the \p elementStrides array are one, \p boxDim specifies the number of elements to load. However, if \p elementStrides[i]
+ * is not equal to one for some \p i, then TMA loads ceil( \p boxDim[i] / \p elementStrides[i]) number of elements along i-th dimension.
+ * To load N elements along i-th dimension, \p boxDim[i] must be set to N * \p elementStrides[i].
+ *
+ * - \p interleave specifies the interleaved layout of type ::CUtensorMapInterleave, which is defined as:
+ * \code
+    typedef enum CUtensorMapInterleave_enum {
+        CU_TENSOR_MAP_INTERLEAVE_NONE = 0,
+        CU_TENSOR_MAP_INTERLEAVE_16B,
+        CU_TENSOR_MAP_INTERLEAVE_32B
+    } CUtensorMapInterleave;
+ * \endcode
+ * TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16 bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
+ * uses 32 bytes.
+ * When \p interleave is ::CU_TENSOR_MAP_INTERLEAVE_NONE, the bounding box inner dimension (computed as \p channelsPerPixel multiplied by
+ * element size in bytes derived from \p tensorDataType) must be less than or equal to the swizzle size.
+ *    - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension to be <= 64.
+ *    - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner dimension to be <= 128.
+ * Additionally, \p tensorDataType of ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B requires \p interleave to be ::CU_TENSOR_MAP_INTERLEAVE_NONE.
+ *
+ * - \p mode, which describes loading of elements loaded along the W dimension, has to be one of the following ::CUtensorMapIm2ColWideMode types:
+ * \code
+ *          CU_TENSOR_MAP_IM2COL_WIDE_MODE_W,
+ *          CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128
+ * \endcode
+ * ::CU_TENSOR_MAP_IM2COL_WIDE_MODE_W allows the number of elements loaded along the W dimension to be specified
+ * via the \p pixelsPerColumn field.
+ *
+ * - \p swizzle, which specifies the shared memory bank swizzling pattern, must be one of the following
+ * ::CUtensorMapSwizzle modes (other swizzle modes are not supported):
+ * \code
+    typedef enum CUtensorMapSwizzle_enum {
+        CU_TENSOR_MAP_SWIZZLE_64B,                   // Swizzle 16B chunks within 64B  span
+        CU_TENSOR_MAP_SWIZZLE_128B,                  // Swizzle 16B chunks within 128B span
+        CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B,         // Swizzle 32B chunks within 128B span
+    } CUtensorMapSwizzle;
+ * \endcode
+ * Data are organized in a specific order in global memory; however, this may not match the order in which the application accesses data
+ * in shared memory. This difference in data organization may cause bank conflicts when shared memory is accessed. In order to avoid this
+ * problem, data can be loaded to shared memory with shuffling across shared memory banks.
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+ * When the \p tensorDataType is ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, only the following swizzle modes are supported:
+ *    - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+ *    - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+ *
+ * - \p l2Promotion specifies L2 fetch size which indicates the byte granularity at which L2 requests are filled from DRAM. It must be of
+ * type ::CUtensorMapL2promotion, which is defined as:
+ * \code
+    typedef enum CUtensorMapL2promotion_enum {
+        CU_TENSOR_MAP_L2_PROMOTION_NONE = 0,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_64B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_128B,
+        CU_TENSOR_MAP_L2_PROMOTION_L2_256B
+    } CUtensorMapL2promotion;
+ * \endcode
+ *
+ * - \p oobFill, which indicates whether zero or a special NaN constant should be used to fill out-of-bound elements, must be of type
+ * ::CUtensorMapFloatOOBfill which is defined as:
+ * \code
+    typedef enum CUtensorMapFloatOOBfill_enum {
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE = 0,
+        CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+    } CUtensorMapFloatOOBfill;
+ * \endcode
+ * Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ * and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+ *
+ * \param tensorMap                - Tensor map object to create
+ * \param tensorDataType           - Tensor data type
+ * \param tensorRank               - Dimensionality of tensor; must be at least 3
+ * \param globalAddress            - Starting address of memory region described by tensor
+ * \param globalDim                - Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ * \param globalStrides            - Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ * \param pixelBoxLowerCornerWidth - Width offset of left box corner
+ * \param pixelBoxUpperCornerWidth - Width offset of right box corner
+ * \param channelsPerPixel         - Number of channels per pixel
+ * \param pixelsPerColumn          - Number of pixels per column
+ * \param elementStrides           - Array containing traversal stride in each of the \p tensorRank dimensions
+ * \param interleave               - Type of interleaved layout the tensor addresses
+ * \param mode                     - W or W128 mode
+ * \param swizzle                  - Bank swizzling pattern inside shared memory
+ * \param l2Promotion              - L2 promotion size
+ * \param oobFill                  - Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapReplaceAddress
+ */
+CUresult CUDAAPI cuTensorMapEncodeIm2colWide(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, int pixelBoxLowerCornerWidth, int pixelBoxUpperCornerWidth, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapIm2ColWideMode mode, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill);
+
+/**
+ * \brief Modify an existing tensor map descriptor with an updated global address
+ *
+ * Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with
+ * an updated \p globalAddress.
+ *
+ * Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ * Additionally, a tensor map object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param tensorMap             - Tensor map object to modify
+ * \param globalAddress         - Starting address of memory region described by tensor, must follow previous alignment requirements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTensorMapEncodeTiled,
+ * ::cuTensorMapEncodeIm2col,
+ * ::cuTensorMapEncodeIm2colWide
+ */
+CUresult CUDAAPI cuTensorMapReplaceAddress(CUtensorMap *tensorMap, void *globalAddress);
+
+/** @} */
+/* END CUDA_TENSOR_MEMORY */
+
+/**
+ * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access
+ *
+ * ___MANBRIEF___ direct peer context memory access functions of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the direct peer context memory access functions
+ * of the low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Queries whether a device may directly access a peer device's memory.
+ *
+ * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of
+ * directly accessing memory from contexts on \p peerDev and 0 otherwise.
+ * If direct access of \p peerDev from \p dev is possible, then access may be
+ * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess().
+ *
+ * \param canAccessPeer - Returned access capability
+ * \param dev           - Device from which allocations on \p peerDev are to
+ *                        be directly accessed.
+ * \param peerDev       - Device on which the allocations to be directly accessed
+ *                        by \p dev reside.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess,
+ * ::cudaDeviceCanAccessPeer
+ */
+CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
+
+/**
+ * \brief Enables direct access to memory allocations in a peer context.
+ *
+ * If both the current context and \p peerContext are on devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and the same
+ * major compute capability, then on success all allocations from \p peerContext will
+ * immediately be accessible by the current context.  See \ref CUDA_UNIFIED for additional
+ * details.
+ *
+ * Note that access granted by this call is unidirectional and that in order to access
+ * memory from the current context in \p peerContext, a separate symmetric call
+ * to ::cuCtxEnablePeerAccess() is required.
+ *
+ * Note that there are both device-wide and system-wide limitations per system
+ * configuration, as noted in the CUDA Programming Guide under the section
+ * "Peer-to-Peer Memory Access".
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates
+ * that the ::CUdevice of the current context cannot directly access memory
+ * from the ::CUdevice of \p peerContext.
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of
+ * \p peerContext from the current context has already been enabled.
+ *
+ * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible
+ * because hardware resources required for peer access have been exhausted.
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
+ * is not a valid context, or if the current context is \p peerContext.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
+ *
+ * \param peerContext - Peer context to enable direct access to from the current context
+ * \param Flags       - Reserved for future use and must be set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
+ * ::CUDA_ERROR_TOO_MANY_PEERS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceCanAccessPeer,
+ * ::cuCtxDisablePeerAccess,
+ * ::cudaDeviceEnablePeerAccess
+ */
+CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
+
+/**
+ * \brief Disables direct access to memory allocations in a peer context and
+ * unregisters any registered allocations.
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
+ * not yet been enabled from \p peerContext to the current context.
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
+ * \p peerContext is not a valid context.
+ *
+ * \param peerContext - Peer context to disable direct access to
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceCanAccessPeer,
+ * ::cuCtxEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess
+ */
+CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attrib of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the
+ *   performance of the link between two devices.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if atomic operations over
+ *   the link are supported.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can
+ *   be accessed over the link.
+ *
+ * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value         - Returned value of the requested attribute
+ * \param attrib        - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice     - The source device of the target link.
+ * \param dstDevice     - The destination device of the target link.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaDeviceGetP2PAttribute
+ */
+CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
+
+/** @} */ /* END CUDA_PEER_ACCESS */
+
+/**
+ * \defgroup CUDA_GRAPHICS Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so that it is no longer
+ * accessible by CUDA unless it is registered again.
+ *
+ * If \p resource is invalid, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned.
+ *
+ * \param resource - Resource to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsD3D9RegisterResource,
+ * ::cuGraphicsD3D10RegisterResource,
+ * ::cuGraphicsD3D11RegisterResource,
+ * ::cuGraphicsGLRegisterBuffer,
+ * ::cuGraphicsGLRegisterImage,
+ * ::cudaGraphicsUnregisterResource
+ */
+CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed.  The value set in \p *pArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture, then it cannot be accessed via an array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p arrayIndex is not a valid array index for \p resource, then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource, then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p resource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pArray      - Returned array through which a subresource of \p resource may be accessed
+ * \param resource    - Mapped resource to access
+ * \param arrayIndex  - Array index for array textures or cubemap face
+ *                      index as defined by ::CUarray_cubemap_face for
+ *                      cubemap textures for the subresource to access
+ * \param mipLevel    - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray
+ */
+CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
+ * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change
+ * every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via a mipmapped array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource        - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsResourceGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p *pSize the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p *pDevPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pDevPtr    - Returned pointer through which \p resource may be accessed
+ * \param pSize      - Returned size of the buffer accessible starting at \p *pDevPtr
+ * \param resource   - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+
+/**
+ * \brief Set usage flags for mapping a graphics resource
+ *
+ * Set \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels.  This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then
+ * ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags    - Parameters for resource mapping
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cudaGraphicsResourceSetMapFlags
+ */
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p hStream begins.
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * \param count      - Number of resources to map
+ * \param resources  - Resources to map for CUDA usage
+ * \param hStream    - Stream with which to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsUnmapResources,
+ * ::cudaGraphicsMapResources
+ */
+CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+
+/**
+ * \brief Unmap graphics resources.
+ *
+ * Unmaps the \p count graphics resources in \p resources.
+ *
+ * Once unmapped, the resources in \p resources may not be accessed by CUDA
+ * until they are mapped again.
+ *
+ * This function provides the synchronization guarantee that any CUDA work issued
+ * in \p hStream before ::cuGraphicsUnmapResources() will complete before any
+ * subsequently issued graphics work begins.
+ *
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param count      - Number of resources to unmap
+ * \param resources  - Resources to unmap
+ * \param hStream    - Stream with which to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cudaGraphicsUnmapResources
+ */
+CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+
+/** @} */ /* END CUDA_GRAPHICS */
+
+/**
+ * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access 
+ *
+ * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the driver entry point access functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the requested driver API function pointer
+ *
+ * Returns in \p *pfn the address of the CUDA driver function for the requested
+ * CUDA version and flags.
+ *
+ * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2
+ * should be specified as 11020. For a requested driver symbol, if the specified
+ * CUDA version is greater than or equal to the CUDA version in which the driver symbol
+ * was introduced, this API will return the function pointer to the corresponding
+ * versioned function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * The API will return ::CUDA_SUCCESS and set the returned \p pfn to NULL if the
+ * requested driver function is not supported on the platform, no ABI
+ * compatible driver function exists for the specified \p cudaVersion or if the
+ * driver symbol is invalid.
+ *
+ * It will also set the optional \p symbolStatus to one of the values in
+ * ::CUdriverProcAddressQueryResult with the following meanings:
+ * - ::CU_GET_PROC_ADDRESS_SUCCESS - The requested symbol was successfully found based
+ *   on input arguments and \p pfn is valid
+ * - ::CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND - The requested symbol was not found
+ * - ::CU_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT - The requested symbol was found but is
+ *   not supported by the specified \p cudaVersion
+ *
+ * The requested flags can be:
+ * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to
+ *   ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise.
+ * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and
+ *                 \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant.
+ * \param pfn - Location to return the function pointer to the requested driver function
+ * \param cudaVersion - The CUDA version to look for the requested driver symbol
+ * \param flags - Flags to specify search options.
+ * \param symbolStatus - Optional location to store the status of the search for
+ *                       \p symbol based on \p cudaVersion. See ::CUdriverProcAddressQueryResult
+ *                       for possible values.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_version_mixing
+ *
+ * \sa
+ * ::cudaGetDriverEntryPoint
+ */
+CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolStatus);
+
+/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */
+
+/**
+ * \defgroup CUDA_COREDUMP Coredump Attributes Control API
+ *
+ * ___MANBRIEF___ coredump attribute control functions for the low-level CUDA API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the coredump attribute control functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * Flags for choosing a coredump attribute to get/set
+ */
+typedef enum CUcoredumpSettings_enum {
+    CU_COREDUMP_ENABLE_ON_EXCEPTION = 1, /**< Bool: GPU exceptions create a coredump at the ::CU_COREDUMP_FILE location */
+    CU_COREDUMP_TRIGGER_HOST,            /**< Bool: the host CPU also creates a coredump (deprecated as of CUDA 12.5) */
+    CU_COREDUMP_LIGHTWEIGHT,             /**< Bool: coredumps omit GPU memory and non-reloc ELF images (deprecated as of CUDA 12.5) */
+    CU_COREDUMP_ENABLE_USER_TRIGGER,     /**< Bool: a coredump can be created by writing to the pipe named by ::CU_COREDUMP_PIPE */
+    CU_COREDUMP_FILE,                    /**< String of up to 1023 characters: location where coredumps are written */
+    CU_COREDUMP_PIPE,                    /**< String of up to 1023 characters: name of the pipe monitored for user-triggered coredumps */
+    CU_COREDUMP_GENERATION_FLAGS,        /**< Integer: bitwise OR of ::CUCoredumpGenerationFlags values controlling coredump contents */
+    CU_COREDUMP_MAX                      /* NOTE(review): looks like an end-of-enum sentinel rather than a settable attribute - confirm */
+} CUcoredumpSettings;
+
+/**
+ * Flags for controlling coredump contents
+ */
+typedef enum CUCoredumpGenerationFlags {
+    CU_COREDUMP_DEFAULT_FLAGS                = 0,        /**< Default: include all memory regions the coredump is able to access */
+    CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES = (1 << 0), /**< Omit data from CUDA source modules that are not relocated at runtime */
+    CU_COREDUMP_SKIP_GLOBAL_MEMORY           = (1 << 1), /**< Omit device-side global data that does not belong to any context */
+    CU_COREDUMP_SKIP_SHARED_MEMORY           = (1 << 2), /**< Omit shared memory associated with the dumped kernel */
+    CU_COREDUMP_SKIP_LOCAL_MEMORY            = (1 << 3), /**< Omit local memory from the kernel */
+    CU_COREDUMP_SKIP_ABORT                   = (1 << 4), /**< GPU exceptions will not raise an abort() in the host CPU process */
+    CU_COREDUMP_SKIP_CONSTBANK_MEMORY        = (1 << 5), /* NOTE(review): not described in the attribute docs; presumably omits constant-bank memory - confirm */
+
+    CU_COREDUMP_LIGHTWEIGHT_FLAGS = CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES
+                                     | CU_COREDUMP_SKIP_GLOBAL_MEMORY
+                                     | CU_COREDUMP_SKIP_SHARED_MEMORY
+                                     | CU_COREDUMP_SKIP_LOCAL_MEMORY
+                                     | CU_COREDUMP_SKIP_CONSTBANK_MEMORY /* every SKIP_* flag above except ::CU_COREDUMP_SKIP_ABORT */
+} CUCoredumpGenerationFlags;
+
+/**
+ * \brief Allows caller to fetch a coredump attribute value for the current context
+ *
+ * Returns in \p *value the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false unless set to ::true globally or locally, or the
+ *      CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false unless set to ::true globally or locally. This attribute is deprecated as
+ *      of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS instead.
+ * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+ *      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+ *      value is ::false unless set to ::true globally or locally.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+ *      that will be monitored if user-triggered coredumps are enabled. The default value is
+ *      ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to fetch.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa
+ * ::cuCoredumpGetAttributeGlobal,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCoredumpSetAttributeGlobal
+ */
+CUresult CUDAAPI cuCoredumpGetAttribute(CUcoredumpSettings attrib, void* value, size_t *size);
+
+/**
+ * \brief Allows caller to fetch a coredump attribute value for the entire application
+ *
+ * Returns in \p *value the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ *      instead.
+ * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+ *      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+ *      value is ::false.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+ *      that will be monitored if user-triggered coredumps are enabled. The default value is
+ *      ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to fetch.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuCoredumpGetAttribute,
+ * ::cuCoredumpSetAttribute,
+ * ::cuCoredumpSetAttributeGlobal
+ */
+CUresult CUDAAPI cuCoredumpGetAttributeGlobal(CUcoredumpSettings attrib, void *value, size_t *size);
+
+/**
+ * \brief Allows caller to set a coredump attribute value for the current context
+ *
+ * This function should be considered an alternate interface to the CUDA-GDB environment
+ * variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump
+ *
+ * An important design decision to note is that any coredump environment variable values
+ * set before CUDA initializes will take permanent precedence over any values set with this
+ * function. This decision was made to ensure no change in behavior for any users that
+ * may be currently using these variables to get coredumps.
+ *
+ * \p *value shall contain the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * \note This function will return ::CUDA_ERROR_NOT_SUPPORTED if the caller attempts to set
+ * ::CU_COREDUMP_ENABLE_ON_EXCEPTION on a GPU with Compute Capability < 6.0. ::cuCoredumpSetAttributeGlobal
+ * works on those platforms as an alternative.
+ *
+ * \note ::CU_COREDUMP_ENABLE_USER_TRIGGER and ::CU_COREDUMP_PIPE cannot be set on a per-context basis.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ *      instead.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to set.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa
+ * ::cuCoredumpGetAttributeGlobal,
+ * ::cuCoredumpGetAttribute,
+ * ::cuCoredumpSetAttributeGlobal
+ */
+CUresult CUDAAPI cuCoredumpSetAttribute(CUcoredumpSettings attrib, void* value, size_t *size);
+
+/**
+ * \brief Allows caller to set a coredump attribute value globally
+ *
+ * This function should be considered an alternate interface to the CUDA-GDB environment
+ * variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump
+ *
+ * An important design decision to note is that any coredump environment variable values
+ * set before CUDA initializes will take permanent precedence over any values set with this
+ * function. This decision was made to ensure no change in behavior for any users that
+ * may be currently using these variables to get coredumps.
+ *
+ * \p *value shall contain the requested value specified by \p attrib. It is up to the caller
+ * to ensure that the data type and size of \p *value matches the request.
+ *
+ * If the caller calls this function with \p *value equal to NULL, the size of the memory
+ * region (in bytes) expected for \p attrib will be placed in \p size.
+ *
+ * The supported attributes are:
+ * - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from
+ *      this context will create a coredump at the location specified by ::CU_COREDUMP_FILE.
+ *      The default value is ::false.
+ * - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will
+ *      also create a coredump. The default value is ::true unless set to ::false globally
+ *      or locally. This value is deprecated as of CUDA 12.5 - raise the ::CU_COREDUMP_SKIP_ABORT
+ *      flag to disable host device abort() if needed.
+ * - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps
+ *      will not have a dump of GPU memory or non-reloc ELF images. The default value is
+ *      ::false. This attribute is deprecated as of CUDA 12.5, please use ::CU_COREDUMP_GENERATION_FLAGS
+ *      instead.
+ * - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be
+ *      created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default
+ *      value is ::false.
+ * - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where
+ *      any coredumps generated by this context will be written. The default value is
+ *      ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running
+ *      the CUDA applications and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe
+ *      that will be monitored if user-triggered coredumps are enabled. This value may not be
+ *      changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default
+ *      value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine
+ *      running the CUDA application and ::PID is the process ID of the CUDA application.
+ * - ::CU_COREDUMP_GENERATION_FLAGS: An integer with values to allow granular control of the data
+ *      contained in a coredump specified as a bitwise OR combination of the following values:
+ *      + ::CU_COREDUMP_DEFAULT_FLAGS - if set by itself, coredump generation returns to its
+ *          default settings of including all memory regions that it is able to access
+ *      + ::CU_COREDUMP_SKIP_NONRELOCATED_ELF_IMAGES - Coredump will not include the data from
+ *          CUDA source modules that are not relocated at runtime.
+ *      + ::CU_COREDUMP_SKIP_GLOBAL_MEMORY - Coredump will not include device-side global data
+ *          that does not belong to any context.
+ *      + ::CU_COREDUMP_SKIP_SHARED_MEMORY - Coredump will not include grid-scale shared memory
+ *          for the warp that the dumped kernel belonged to.
+ *      + ::CU_COREDUMP_SKIP_LOCAL_MEMORY - Coredump will not include local memory from the kernel.
+ *      + ::CU_COREDUMP_LIGHTWEIGHT_FLAGS - Enables all of the above options. Equivalent to setting
+ *          the ::CU_COREDUMP_LIGHTWEIGHT attribute to ::true.
+ *      + ::CU_COREDUMP_SKIP_ABORT - If set, GPU exceptions will not raise an abort() in the host CPU
+ *          process. Same functional goal as ::CU_COREDUMP_TRIGGER_HOST but better reflects the default
+ *          behavior.
+ *
+ * \param attrib - The enum defining which value to set.
+ * \param value - void* containing the requested data.
+ * \param size - The size of the memory region \p value points to.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_PERMITTED
+ *
+ * \sa
+ * ::cuCoredumpGetAttribute,
+ * ::cuCoredumpGetAttributeGlobal,
+ * ::cuCoredumpSetAttribute
+ */
+CUresult CUDAAPI cuCoredumpSetAttributeGlobal(CUcoredumpSettings attrib, void *value, size_t *size);
+
+/** @} */ /* END CUDA_COREDUMP */
+
+/*
+ * NOTE(review): this entry point carries no Doxygen documentation in this header.
+ * Judging by the signature alone, it hands back a table pointer through
+ * ppExportTable for the table identified by pExportTableId; presumably it is a
+ * driver-internal interface -- confirm before relying on it.
+ */
+CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
+
+/*
+** ******************* GREEN CONTEXTS **********************
+*/
+
+/**
+ * \defgroup CUDA_GREEN_CONTEXTS Green Contexts
+ *
+ * ___MANBRIEF___ Driver level API for creation and manipulation of green contexts
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the APIs for creation and manipulation of green contexts in the CUDA
+ * driver. Green contexts are a lightweight alternative to traditional contexts, with the ability
+ * to pass in a set of resources that they should be initialized with. This allows the developer to
+ * represent distinct spatial partitions of the GPU, provision resources for them, and target them
+ * via the same programming model that CUDA exposes (streams, kernel launches, etc.).
+ *
+ * There are 4 main steps to using these new set of APIs.
+ * - (1) Start with an initial set of resources, for example via ::cuDeviceGetDevResource. Only SM type is supported today.
+ * - (2) Partition this set of resources by providing them as input to a partition API, for example: ::cuDevSmResourceSplitByCount.
+ * - (3) Finalize the specification of resources by creating a descriptor via ::cuDevResourceGenerateDesc.
+ * - (4) Provision the resources and create a green context via ::cuGreenCtxCreate.
+ *
+ * For \p CU_DEV_RESOURCE_TYPE_SM, the partitions created have minimum SM count requirements, often rounding up and aligning the
+ * minCount provided to ::cuDevSmResourceSplitByCount. The following is a guideline for each architecture
+ * and may be subject to change:
+ * - On Compute Architecture 6.X: The minimum count is 1 SM.
+ * - On Compute Architecture 7.X: The minimum count is 2 SMs and must be a multiple of 2.
+ * - On Compute Architecture 8.X: The minimum count is 4 SMs and must be a multiple of 2.
+ * - On Compute Architecture 9.0+: The minimum count is 8 SMs and must be a multiple of 8.
+ *
+ * In the future, flags can be provided to tradeoff functional and performance characteristics versus finer grained SM partitions.
+ *
+ * Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched
+ * in them will run concurrently or have forward progress guarantees. This is due to other resources (like HW connections,
+ * see ::CUDA_DEVICE_MAX_CONNECTIONS) that could cause a dependency. Additionally, in certain scenarios,
+ * it is possible for the workload to run on more SMs than was provisioned (but never less).
+ * The following are two scenarios which can exhibit this behavior:
+ * - On Volta+ MPS: When \p CUDA_MPS_ACTIVE_THREAD_PERCENTAGE is used,
+ * the set of SMs that are used for running kernels can be scaled up to the value of SMs used for the MPS client.
+ * - On Compute Architecture 9.x: When a module with dynamic parallelism (CDP) is loaded, all future
+ * kernels running under green contexts may use and share an additional set of 2 SMs.
+ *
+ * @{
+ */
+
+/*!
+ * \typedef struct CUdevResourceDesc_st* CUdevResourceDesc;
+ * An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources.
+ * Created via ::cuDevResourceGenerateDesc and passed to ::cuGreenCtxCreate.
+ */
+typedef struct CUdevResourceDesc_st *CUdevResourceDesc;
+
+/*!
+ * \typedef enum CUgreenCtxCreate_flags
+ * Creation flags accepted in the \p flags argument of ::cuGreenCtxCreate.
+ */
+typedef enum {
+    CU_GREEN_CTX_DEFAULT_STREAM = 0x1, /**< Required. Creates a default stream to use inside the green context */
+} CUgreenCtxCreate_flags;
+
+/*!
+ * \typedef enum CUdevSmResourceSplit_flags
+ * Usage flags accepted in the \p useFlags argument of ::cuDevSmResourceSplitByCount.
+ */
+typedef enum {
+    CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING = 0x1, /**< Lower the minimum SM count and alignment, and treat each SM independent of its hierarchy */
+    CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE = 0x2, /**< Compute Capability 9.0+ only. Attempt to create groups that may allow for maximally sized thread clusters */
+} CUdevSmResourceSplit_flags;
+
+/* Version suffix of the CUdevResource type name and byte size of its public union payload. */
+#define RESOURCE_ABI_VERSION 1
+#define RESOURCE_ABI_EXTERNAL_BYTES 48
+
+/* Two-level token pasting: the outer macro lets its arguments be macro-expanded before the inner one concatenates them. */
+#define _CONCAT_INNER(x, y) x ## y
+#define _CONCAT_OUTER(x, y) _CONCAT_INNER(x, y)
+
+/*!
+ * \typedef enum CUdevResourceType
+ * Type of resource
+ */
+typedef enum {
+    CU_DEV_RESOURCE_TYPE_INVALID = 0, /**< The resource is not valid and cannot be further accessed */
+    CU_DEV_RESOURCE_TYPE_SM = 1, /**< Streaming multiprocessors related information */
+/* The enumerator below is only compiled in for internal API-version builds. */
+#if defined(__CUDA_API_VERSION_INTERNAL) && !defined(__CUDA_API_VERSION_INTERNAL_ODR)
+    CU_DEV_RESOURCE_TYPE_MAX,
+#endif
+} CUdevResourceType;
+
+/*!
+ * \struct CUdevSmResource
+ * Data for SM-related resources
+ */
+typedef struct CUdevSmResource_st {
+    unsigned int smCount; /**< The number of streaming multiprocessors available in this resource. This is an output parameter only, do not write to this field. */
+} CUdevSmResource;
+
+/*!
+ * \struct CUdevResource
+ * A tagged union describing different resources identified by the type field. This structure should not be directly modified outside of the API that created it.
+ * \code
+ * struct {
+ *     CUdevResourceType type;
+ *     union {
+ *         CUdevSmResource sm;
+ *     };
+ * };
+ * \endcode
+ * - If \p type is \p CU_DEV_RESOURCE_TYPE_INVALID, this resource is not valid and cannot be further accessed.
+ * - If \p type is \p CU_DEV_RESOURCE_TYPE_SM, the ::CUdevSmResource structure \p sm is filled in. For example,
+ * \p sm.smCount will reflect the number of streaming multiprocessors available in this resource.
+ */
+typedef struct CUdevResource_st {
+    CUdevResourceType type; /**< Type of resource, dictates which union field was last set */
+    /* Not part of the documented layout shown above; do not read or write. */
+    unsigned char _internal_padding[92];
+    union {
+        CUdevSmResource sm; /**< Resource corresponding to \p CU_DEV_RESOURCE_TYPE_SM type. */
+        /* Sizes the union to at least RESOURCE_ABI_EXTERNAL_BYTES bytes regardless of the active member. */
+        unsigned char _oversize[RESOURCE_ABI_EXTERNAL_BYTES];
+    };
+} _CONCAT_OUTER(CUdevResource_v, RESOURCE_ABI_VERSION);
+/* Unversioned alias for the current ABI version of the structure. */
+typedef _CONCAT_OUTER(CUdevResource_v, RESOURCE_ABI_VERSION) CUdevResource;
+
+/* Drop the token-pasting helpers so they do not leak to includers of this header. */
+#undef _CONCAT_INNER
+#undef _CONCAT_OUTER
+
+/*
+ * NOTE(review): the two names below do not match the macros defined above
+ * (RESOURCE_ABI_VERSION and RESOURCE_ABI_EXTERNAL_BYTES), so unless they are
+ * defined elsewhere in this file these #undefs are no-ops and the RESOURCE_ABI_*
+ * macros remain defined for includers -- confirm whether that is intended.
+ */
+#undef ABI_PER_RESOURCE_EXTERNAL_BYTES
+#undef ABI_RESOURCE_VERSION
+
+/**
+ * \brief Creates a green context with a specified set of resources.
+ *
+ * This API creates a green context with the resources specified in the descriptor \p desc and
+ * returns it in the handle represented by \p phCtx. This API will retain the primary context on device \p dev,
+ * which is released when the green context is destroyed. It is advised to have the primary context active
+ * before calling this API to avoid the heavy cost of triggering primary context initialization and
+ * deinitialization multiple times.
+ *
+ * The API does not set the green context current. In order to set it current, you need to explicitly set it current
+ * by first converting the green context to a CUcontext using ::cuCtxFromGreenCtx and subsequently calling
+ * ::cuCtxSetCurrent / ::cuCtxPushCurrent. It should be noted that a green context can be current to only one
+ * thread at a time. There is no internal synchronization to make API calls accessing the same green context
+ * from multiple threads work.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param phCtx - Pointer for the output handle to the green context
+ * \param desc - Descriptor generated via ::cuDevResourceGenerateDesc which contains the set of resources to be used
+ * \param dev - Device on which to create the green context.
+ * \param flags - One of the supported green context creation flags. \p CU_GREEN_CTX_DEFAULT_STREAM is required.
+ *
+ * The supported flags are:
+ * - \p CU_GREEN_CTX_DEFAULT_STREAM : Creates a default stream to use inside the green context. Required.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuGreenCtxDestroy,
+ * ::cuCtxFromGreenCtx,
+ * ::cuCtxSetCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuDevResourceGenerateDesc,
+ * ::cuDevicePrimaryCtxRetain,
+ * ::cuCtxCreate,
+ * ::cuCtxCreate_v3
+ */
+CUresult CUDAAPI cuGreenCtxCreate(CUgreenCtx* phCtx, CUdevResourceDesc desc, CUdevice dev, unsigned int flags);
+
+/**
+ * \brief Destroys a green context
+ *
+ * Destroys the green context, releasing the primary context of the device that this green context was created for.
+ * Any resources provisioned for this green context (that were initially available via the resource descriptor)
+ * are released as well.
+ *
+ * \param hCtx - Green context to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ *
+ * \sa
+ * ::cuGreenCtxCreate,
+ * ::cuCtxDestroy
+ */
+CUresult CUDAAPI cuGreenCtxDestroy(CUgreenCtx hCtx);
+
+/**
+ * \brief Converts a green context into the primary context
+ *
+ * The API converts a green context into the primary context returned in \p pContext. It is important
+ * to note that the converted context \p pContext is a normal primary context but with
+ * the resources of the specified green context \p hCtx. Once converted, it can then
+ * be used to set the context current with ::cuCtxSetCurrent or with any of the CUDA APIs
+ * that accept a CUcontext parameter.
+ *
+ * Users are expected to call this API before calling any CUDA APIs that accept a
+ * CUcontext. Failing to do so will result in the APIs returning ::CUDA_ERROR_INVALID_CONTEXT.
+ *
+ * \param pContext - Returned primary context with green context resources
+ * \param hCtx - Green context to convert
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuGreenCtxCreate
+ */
+CUresult CUDAAPI cuCtxFromGreenCtx(CUcontext *pContext, CUgreenCtx hCtx);
+
+/**
+ * \brief Get device resources
+ *
+ * Get the \p type resources available to the \p device.
+ * This may often be the starting point for further partitioning or configuring of resources.
+ * Only the SM resource type (\p CU_DEV_RESOURCE_TYPE_SM) is supported today.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param device - Device to get resource for
+ * \param resource - Output pointer to a CUdevResource structure
+ * \param type - Type of resource to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDevSmResourceSplitByCount,
+ * ::cuDevResourceGenerateDesc
+ */
+CUresult CUDAAPI cuDeviceGetDevResource(CUdevice device, CUdevResource* resource, CUdevResourceType type);
+
+/**
+ * \brief Get context resources
+ *
+ * Get the \p type resources available to the context represented by \p hCtx
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param hCtx - Context to get resource for
+ * \param resource - Output pointer to a CUdevResource structure
+ * \param type - Type of resource to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ *
+ * \sa
+ * ::cuDevResourceGenerateDesc
+ */
+CUresult CUDAAPI cuCtxGetDevResource(CUcontext hCtx, CUdevResource* resource, CUdevResourceType type);
+
+/**
+ * \brief Get green context resources
+ *
+ * Get the \p type resources available to the green context represented by \p hCtx
+ *
+ * \param hCtx - Green context to get resource for
+ * \param resource - Output pointer to a CUdevResource structure
+ * \param type - Type of resource to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuDevResourceGenerateDesc
+ */
+CUresult CUDAAPI cuGreenCtxGetDevResource(CUgreenCtx hCtx, CUdevResource* resource, CUdevResourceType type);
+
+/**
+ * \brief Splits \p CU_DEV_RESOURCE_TYPE_SM resources.
+ *
+ * Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount
+ * and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the amount of groups that
+ * would be created in \p nbGroups. Otherwise, \p nbGroups must point to the amount of elements in \p result and on return,
+ * the API will overwrite \p nbGroups with the amount actually created. The groups are written to the array in \p result.
+ * \p nbGroups can be less than the total amount if a smaller number of groups is needed.
+ *
+ * This API is used to spatially partition the input resource. The input resource needs to come from one of
+ * ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource.
+ * A limitation of the API is that the output results cannot be split again without
+ * first creating a descriptor and a green context with that descriptor.
+ *
+ * When creating the groups, the API will take into account the performance and functional characteristics of the
+ * input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups created
+ * than purely dividing the total SM count by the \p minCount due to cluster requirements or
+ * alignment and granularity requirements for the minCount.
+ *
+ * The \p remaining set does not have the same functional or performance guarantees as the groups in \p result.
+ * Its use should be carefully planned and future partitions of the \p remaining set are discouraged.
+ *
+ * The following flags are supported:
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING : Lower the minimum SM count and alignment, and treat each SM independent of its hierarchy.
+ *  This allows more fine grained partitions but at the cost of advanced features (such as large clusters on compute capability 9.0+).
+ * - \p CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE : Compute Capability 9.0+ only. Attempt to create groups that may allow
+ *  for maximally sized thread clusters. This can be queried post green context creation using ::cuOccupancyMaxPotentialClusterSize.
+ *
+ * A successful API call must either have:
+ * - A valid array of \p result pointers of size passed in \p nbGroups, with \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
+ * - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p input of type \p CU_DEV_RESOURCE_TYPE_SM.
+ * Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining may be NULL.
+ * This queries the number of groups that would be created by the API.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param result - Output array of \p CUdevResource resources. Can be NULL to query the number of groups.
+ * \param nbGroups - This is a pointer, specifying the number of groups that would be or should be created as described above.
+ * \param input - Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource.
+ * \param remaining - If the input resource cannot be cleanly split among \p nbGroups, the remaining is placed in here.
+ * Can be omitted (NULL) if the user does not need the remaining set.
+ * \param useFlags - Flags specifying how these partitions are used or which constraints to abide by when splitting the input. Zero is valid for default behavior.
+ * \param minCount - Minimum number of SMs required
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION
+ *
+ * \sa
+ * ::cuGreenCtxGetDevResource,
+ * ::cuCtxGetDevResource,
+ * ::cuDeviceGetDevResource
+ */
+CUresult CUDAAPI cuDevSmResourceSplitByCount(
+    CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount);
+
+/**
+ * \brief Generate a resource descriptor
+ *
+ * Generates a single resource descriptor with the set of resources specified in \p resources.
+ * The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API.
+ * Resources of the same type can be passed in, provided they meet the requirements as noted below.
+ *
+ * A successful API call must have:
+ * - A valid output pointer for the \p phDesc descriptor as well as a valid array of \p resources pointers,
+ * with the array size passed in \p nbResources.
+ * If multiple resources are provided in \p resources, the device they came from must be the same,
+ * otherwise ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
+ * If multiple resources are provided in \p resources and they are of type ::CU_DEV_RESOURCE_TYPE_SM,
+ * they must be outputs (whether \p result or \p remaining) from the same split API instance,
+ * otherwise ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
+ *
+ * Note: The API is not supported on 32-bit platforms.
+ *
+ * \param phDesc - Output descriptor
+ * \param resources - Array of resources to be included in the descriptor
+ * \param nbResources - Number of resources passed in \p resources
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_TYPE,
+ * ::CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION
+ *
+ * \sa
+ * ::cuDevSmResourceSplitByCount
+ */
+CUresult CUDAAPI cuDevResourceGenerateDesc(CUdevResourceDesc *phDesc, CUdevResource *resources, unsigned int nbResources);
+
+/**
+ * \brief Records an event.
+ *
+ * Captures in \p hEvent all the activities of the green context of \p hCtx
+ * at the time of this call. \p hEvent and \p hCtx must be from the same
+ * primary context otherwise ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will
+ * then examine or wait for completion of the work that was captured. Uses of
+ * \p hCtx after this call do not modify \p hEvent.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the
+ * specified green context \p hCtx has a stream in the capture mode. In such
+ * a case, the call will invalidate all the conflicting captures.
+ *
+ * \param hCtx - Green context to record event for
+ * \param hEvent  - Event to record
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuGreenCtxWaitEvent,
+ * ::cuEventRecord,
+ * ::cuCtxRecordEvent,
+ * ::cuCtxWaitEvent
+ */
+CUresult CUDAAPI cuGreenCtxRecordEvent(CUgreenCtx hCtx, CUevent hEvent);
+
+/**
+ * \brief Make a green context wait on an event
+ *
+ * Makes all future work submitted to green context \p hCtx wait for all work
+ * captured in \p hEvent. The synchronization will be performed on the device
+ * and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent()
+ * or ::cuEventRecord(), for details on what is captured by an event.
+ *
+ * \note \p hEvent may be from a different context or device than \p hCtx.
+ *
+ * \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and
+ * invalidate the capture if the specified event \p hEvent is part of an
+ * ongoing capture sequence or if the specified green context \p hCtx has
+ * a stream in the capture mode.
+ *
+ * \param hCtx    - Green context to wait
+ * \param hEvent  - Event to wait on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED
+ *
+ * \sa
+ * ::cuGreenCtxRecordEvent,
+ * ::cuStreamWaitEvent,
+ * ::cuCtxRecordEvent,
+ * ::cuCtxWaitEvent
+ */
+CUresult CUDAAPI cuGreenCtxWaitEvent(CUgreenCtx hCtx, CUevent hEvent);
+
+/**
+ * \brief Query the green context associated with a stream
+ *
+ * Returns the CUDA green context that the stream is associated with, or NULL if the stream
+ * is not associated with any green context.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>
+ *   a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, ::cuStreamCreateWithPriority
+ *   and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   If during stream creation the context that was active in the calling thread was obtained
+ *   with cuCtxFromGreenCtx, that green context is returned in \p phCtx.
+ *   Otherwise, \p *phCtx is set to NULL instead.
+ *   </li>
+ *   <li>
+ *   a special stream such as the NULL stream or ::CU_STREAM_LEGACY.
+ *   In that case if the context that is active in the calling thread was obtained
+ *   with cuCtxFromGreenCtx, that green context is returned.
+ *   Otherwise, \p *phCtx is set to NULL instead.
+ *   </li>
+ * </ul>
+ * Passing an invalid handle will result in undefined behavior.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param phCtx   - Returned green context associated with the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority,
+ * ::cuStreamGetCtx_v2,
+ * ::cuGreenCtxStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamGetGreenCtx(CUstream hStream, CUgreenCtx *phCtx);
+
+/**
+ * \brief Create a stream for use in the green context
+ *
+ * Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream.
+ * The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that
+ * is current to the calling thread and creates a stream in the specified green context \p greenCtx.
+ *
+ * The supported values for \p flags are:
+ * - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created
+ *   stream may run concurrently with work in the default stream, and that
+ *   the created stream should perform no implicit synchronization with the default stream.
+ *
+ * Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a
+ * hint to preferentially run work with higher priority when possible, but do not preempt
+ * already-running work or provide any other functional guarantee on execution order.
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param phStream - Returned newly created stream
+ * \param greenCtx - Green context for which to create the stream
+ * \param flags    - Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified.
+ * \param priority - Stream priority. Lower numbers represent higher priorities.
+ *                   See ::cuCtxGetStreamPriorityRange for more information about
+ *                   meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuGreenCtxCreate,
+ * ::cuStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamGetDevice,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreateWithPriority
+ */
+CUresult CUDAAPI cuGreenCtxStreamCreate(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
+
+/** @} */
+
+/*
+** *************** END CUDA_GREEN_CONTEXTS *****************
+*/
+
+/**
+ * CUDA API versioning support
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuMemHostRegister
+    #undef cuGraphicsResourceSetMapFlags
+    #undef cuLinkCreate
+    #undef cuLinkAddData
+    #undef cuLinkAddFile
+    #undef cuDeviceTotalMem
+    #undef cuCtxCreate
+    #undef cuModuleGetGlobal
+    #undef cuMemGetInfo
+    #undef cuMemAlloc
+    #undef cuMemAllocPitch
+    #undef cuMemFree
+    #undef cuMemGetAddressRange
+    #undef cuMemAllocHost
+    #undef cuMemHostGetDevicePointer
+    #undef cuMemcpyHtoD
+    #undef cuMemcpyDtoH
+    #undef cuMemcpyDtoD
+    #undef cuMemcpyDtoA
+    #undef cuMemcpyAtoD
+    #undef cuMemcpyHtoA
+    #undef cuMemcpyAtoH
+    #undef cuMemcpyAtoA
+    #undef cuMemcpyHtoAAsync
+    #undef cuMemcpyAtoHAsync
+    #undef cuMemcpy2D
+    #undef cuMemcpy2DUnaligned
+    #undef cuMemcpy3D
+    #undef cuMemcpyHtoDAsync
+    #undef cuMemcpyDtoHAsync
+    #undef cuMemcpyDtoDAsync
+    #undef cuMemcpy2DAsync
+    #undef cuMemcpy3DAsync
+    #undef cuMemcpyBatchAsync
+    #undef cuMemcpy3DBatchAsync
+    #undef cuMemsetD8
+    #undef cuMemsetD16
+    #undef cuMemsetD32
+    #undef cuMemsetD2D8
+    #undef cuMemsetD2D16
+    #undef cuMemsetD2D32
+    #undef cuArrayCreate
+    #undef cuArrayGetDescriptor
+    #undef cuArray3DCreate
+    #undef cuArray3DGetDescriptor
+    #undef cuTexRefSetAddress
+    #undef cuTexRefSetAddress2D
+    #undef cuTexRefGetAddress
+    #undef cuGraphicsResourceGetMappedPointer
+    #undef cuCtxDestroy
+    #undef cuCtxPopCurrent
+    #undef cuCtxPushCurrent
+    #undef cuStreamDestroy
+    #undef cuEventDestroy
+    #undef cuMemcpy
+    #undef cuMemcpyAsync
+    #undef cuMemcpyPeer
+    #undef cuMemcpyPeerAsync
+    #undef cuMemcpy3DPeer
+    #undef cuMemcpy3DPeerAsync
+    #undef cuMemsetD8Async
+    #undef cuMemsetD16Async
+    #undef cuMemsetD32Async
+    #undef cuMemsetD2D8Async
+    #undef cuMemsetD2D16Async
+    #undef cuMemsetD2D32Async
+    #undef cuStreamGetPriority
+    #undef cuStreamGetId
+    #undef cuStreamGetFlags
+    #undef cuStreamGetDevice
+    #undef cuStreamGetCtx
+    #undef cuStreamWaitEvent
+    #undef cuStreamAddCallback
+    #undef cuStreamAttachMemAsync
+    #undef cuStreamQuery
+    #undef cuStreamSynchronize
+    #undef cuEventRecord
+    #undef cuEventRecordWithFlags
+    #undef cuLaunchKernel
+    #undef cuLaunchKernelEx
+    #undef cuLaunchHostFunc
+    #undef cuGraphicsMapResources
+    #undef cuGraphicsUnmapResources
+    #undef cuStreamWriteValue32
+    #undef cuStreamWaitValue32
+    #undef cuStreamWriteValue64
+    #undef cuStreamWaitValue64
+    #undef cuStreamBatchMemOp
+    #undef cuStreamWriteValue32_v2
+    #undef cuStreamWaitValue32_v2
+    #undef cuStreamWriteValue64_v2
+    #undef cuStreamWaitValue64_v2
+    #undef cuStreamBatchMemOp_v2
+    #undef cuMemPrefetchAsync
+    #undef cuMemPrefetchAsync_v2
+    #undef cuLaunchCooperativeKernel
+    #undef cuSignalExternalSemaphoresAsync
+    #undef cuWaitExternalSemaphoresAsync
+    #undef cuStreamBeginCapture
+    #undef cuStreamBeginCaptureToGraph
+    #undef cuStreamEndCapture
+    #undef cuStreamIsCapturing
+    #undef cuStreamGetCaptureInfo
+    #undef cuStreamGetCaptureInfo_v2
+    #undef cuStreamGetCaptureInfo_v3
+    #undef cuGraphInstantiateWithParams
+    #undef cuGraphExecUpdate
+    #undef cuGraphUpload
+    #undef cuGraphLaunch
+    #undef cuDevicePrimaryCtxRelease
+    #undef cuDevicePrimaryCtxReset
+    #undef cuDevicePrimaryCtxSetFlags
+    #undef cuIpcOpenMemHandle
+    #undef cuStreamCopyAttributes
+    #undef cuStreamSetAttribute
+    #undef cuStreamGetAttribute
+    #undef cuGraphInstantiate
+    #undef cuGraphAddKernelNode
+    #undef cuGraphKernelNodeGetParams
+    #undef cuGraphKernelNodeSetParams
+    #undef cuGraphExecKernelNodeSetParams
+    #undef cuMemMapArrayAsync
+    #undef cuMemFreeAsync 
+    #undef cuMemAllocAsync 
+    #undef cuMemAllocFromPoolAsync 
+    #undef cuStreamUpdateCaptureDependencies
+    #undef cuStreamUpdateCaptureDependencies_v2
+    #undef cuGetProcAddress
+    #undef cuStreamGetCtx_v2
+    #undef cuMemBatchDecompressAsync
+
+    CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); /* This group: prototypes whose byte sizes / pitch are size_t (bytesize, size, Pitch) */
+    CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+    CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); /* Takes CUDA_ARRAY_DESCRIPTOR and CUdeviceptr, not the _v1 types declared below */
+
+    typedef unsigned int CUdeviceptr_v1; /* Device pointer type used by the _v1 structs and the unsigned-int prototypes that follow */
+
+    typedef struct CUDA_MEMCPY2D_v1_st /* v1 2D copy descriptor (all offsets, pitches and extents are unsigned int); taken by the cuMemcpy2D, cuMemcpy2DUnaligned and cuMemcpy2DAsync prototypes below */
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer (CUdeviceptr_v1 is a typedef of unsigned int) */
+        CUarray srcArray;           /**< Source array reference */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer (CUdeviceptr_v1 is a typedef of unsigned int) */
+        CUarray dstArray;           /**< Destination array reference */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+
+        unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
+        unsigned int Height;        /**< Height of 2D memory copy */
+    } CUDA_MEMCPY2D_v1;
+
+    typedef struct CUDA_MEMCPY3D_v1_st /* v1 3D copy descriptor (all offsets, pitches and extents are unsigned int); taken by the cuMemcpy3D and cuMemcpy3DAsync prototypes below */
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        unsigned int srcZ;          /**< Source Z */
+        unsigned int srcLOD;        /**< Source LOD */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer (CUdeviceptr_v1 is a typedef of unsigned int) */
+        CUarray srcArray;           /**< Source array reference */
+        void *reserved0;            /**< Must be NULL */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+        unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        unsigned int dstZ;          /**< Destination Z */
+        unsigned int dstLOD;        /**< Destination LOD */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer (CUdeviceptr_v1 is a typedef of unsigned int) */
+        CUarray dstArray;           /**< Destination array reference */
+        void *reserved1;            /**< Must be NULL */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+        unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+        unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
+        unsigned int Height;        /**< Height of 3D memory copy */
+        unsigned int Depth;         /**< Depth of 3D memory copy */
+    } CUDA_MEMCPY3D_v1;
+
+    typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st /* v1 array descriptor; taken by the cuArrayCreate, cuArrayGetDescriptor and cuTexRefSetAddress2D prototypes below */
+    {
+        unsigned int Width;         /**< Width of array */
+        unsigned int Height;        /**< Height of array */
+
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+    } CUDA_ARRAY_DESCRIPTOR_v1;
+
+    typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st /* v1 3D array descriptor; taken by the cuArray3DCreate and cuArray3DGetDescriptor prototypes below */
+    {
+        unsigned int Width;         /**< Width of 3D array */
+        unsigned int Height;        /**< Height of 3D array */
+        unsigned int Depth;         /**< Depth of 3D array */
+
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+        unsigned int Flags;         /**< Flags */
+    } CUDA_ARRAY3D_DESCRIPTOR_v1;
+
+    CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); /* This group: sizes, offsets and pitches are unsigned int, device pointers are CUdeviceptr_v1, descriptors are the _v1 structs above */
+    CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+    CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
+    CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
+    CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+
+    CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); /* This group: context/stream/event and primary-context prototypes under their plain names (each name is #undef'd earlier in this block) */
+    CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
+    CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
+    CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
+    CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
+    CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+    CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); /* This group: _v2 copy/memset prototypes use size_t and CUdeviceptr where the unsuffixed ones above use unsigned int and CUdeviceptr_v1 */
+    CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy);
+    CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy);
+    CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy);
+    CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+    CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N);
+    CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+    CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); /* From here on: unsuffixed copy, peer, batch and async memset prototypes, also size_t / CUdeviceptr based */
+    CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+    CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyBatchAsync(CUdeviceptr *dsts, CUdeviceptr *srcs, size_t *sizes, size_t count,
+                                        CUmemcpyAttributes *attrs, size_t *attrsIdxs, size_t numAttrs,
+                                        size_t *failIdx, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DBatchAsync(size_t numOps, CUDA_MEMCPY3D_BATCH_OP *opList,
+                                          size_t *failIdx, unsigned long long flags, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+
+    CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); /* This group: stream query/wait, event record, launch, graphics map/unmap and stream memory-op prototypes */
+    CUresult CUDAAPI cuStreamGetId(CUstream hStream, unsigned long long *streamId);
+    CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
+    CUresult CUDAAPI cuStreamGetDevice(CUstream hStream, CUdevice *device);
+    CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
+    CUresult CUDAAPI cuStreamGetCtx_v2(CUstream hStream, CUcontext *pCtx, CUgreenCtx *pGreenCtx);
+    CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+    CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+    CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
+    CUresult CUDAAPI cuStreamQuery(CUstream hStream);
+    CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
+    CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
+    CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
+    CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+    CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+    CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
+    CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+    CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+    CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+    CUresult CUDAAPI cuStreamWriteValue32_ptsz(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); /* _ptsz-suffixed counterparts of the five stream memory-op prototypes just above; parameter lists are identical */
+    CUresult CUDAAPI cuStreamWaitValue32_ptsz(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64_ptsz(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64_ptsz(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp_ptsz(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+    CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); /* This group: _v2 stream memory-op prototypes, then prefetch, cooperative launch, external semaphore, stream capture, graph and stream attribute prototypes */
+    CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+    CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+    CUresult CUDAAPI cuMemPrefetchAsync_v2(CUdeviceptr devPtr, size_t count, CUmemLocation location, unsigned int flags, CUstream hStream);
+    CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+    CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+    CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+    CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode);
+    CUresult CUDAAPI cuStreamBeginCaptureToGraph(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+    CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
+    CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+    CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_ptsz(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_v3(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out);
+    CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); /* The four kernel-node prototypes here take CUDA_KERNEL_NODE_PARAMS_v1 */
+    CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+    CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+    CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+    CUresult CUDAAPI cuGraphInstantiateWithParams(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+    CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
+    CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream);
+    CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream);
+    CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream);
+    CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value);
+    CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param);
+
+    CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); /* Remaining prototypes: IPC handle open, graph instantiate, array mapping, async alloc/free, capture-dependency update, batch decompress, cuGetProcAddress */
+    CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+    CUresult CUDAAPI cuGraphInstantiate_v2(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); /* Same parameter list as cuGraphInstantiate above */
+
+    CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream);
+
+    CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+
+    CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+    CUresult CUDAAPI cuStreamUpdateCaptureDependencies_v2(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags); /* _v2 adds the dependencyData parameter */
+
+    CUresult CUDAAPI cuMemBatchDecompressAsync(
+        CUmemDecompressParams *paramsArray,
+        size_t count,
+        unsigned int flags,
+        size_t *errorIndex,
+        CUstream stream
+    );
+
+    CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
+
+#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
+static inline CUresult cuGetProcAddress_v2_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolStatus) {
+    /* The two bits with which a caller explicitly selects legacy or per-thread default-stream lookup. */
+    const int streamFlavourBits = (CU_GET_PROC_ADDRESS_LEGACY_STREAM |
+                                   CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM);
+    /* Neither bit set: add the per-thread bit; otherwise forward the caller's flags untouched. */
+    const cuuint64_t forwardedFlags = ((flags & streamFlavourBits) != 0) ? flags : (flags | CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM);
+    return cuGetProcAddress_v2(symbol, funcPtr, driverVersion, forwardedFlags, symbolStatus);
+}
+#define cuGetProcAddress_v2 cuGetProcAddress_v2_ptsz
+#endif
+
+/**
+ * \defgroup CUDA_CHECKPOINT CUDA Checkpointing
+ *
+ * ___MANBRIEF___ CUDA checkpoint and restore functionality of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the checkpoint and restore functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The CUDA checkpoint and restore APIs provide a way to save and restore GPU
+ * state for full process checkpoints when used with CPU side process
+ * checkpointing solutions. They can also be used to pause GPU work and suspend
+ * a CUDA process to allow other applications to make use of GPU resources.
+ *
+ * Checkpoint and restore capabilities are currently restricted to Linux.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the restore thread ID for a CUDA process
+ *
+ * Returns in \p *tid the thread ID of the CUDA restore thread for the process
+ * specified by \p pid.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param tid - Returned restore thread ID
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessGetRestoreThreadId(int pid, int *tid);
+
+/**
+ * \brief Returns the process state of a CUDA process
+ *
+ * Returns in \p *state the current state of the CUDA process specified by \p pid.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param state - Returned CUDA process state
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessGetState(int pid, CUprocessState *state);
+
+/**
+ * \brief Lock a running CUDA process
+ *
+ * Lock the CUDA process specified by \p pid which will block further CUDA API
+ * calls. Process must be in the RUNNING state in order to lock.
+ *
+ * Upon successful return the process will be in the LOCKED state.
+ *
+ * If timeoutMs is specified and the timeout is reached the process will be left
+ * in the RUNNING state upon return.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional lock operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * ::CUDA_ERROR_NOT_READY
+ */
+CUresult CUDAAPI cuCheckpointProcessLock(int pid, CUcheckpointLockArgs *args);
+
+/**
+ * \brief Checkpoint a CUDA process's GPU memory contents
+ *
+ * Checkpoints a CUDA process specified by \p pid that is in the LOCKED
+ * state. The GPU memory contents will be brought into host memory and all
+ * underlying references will be released. Process must be in the LOCKED state
+ * to checkpoint.
+ *
+ * Upon successful return the process will be in the CHECKPOINTED state.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional checkpoint operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessCheckpoint(int pid, CUcheckpointCheckpointArgs *args);
+
+/**
+ * \brief Restore a CUDA process's GPU memory contents from its last checkpoint
+ *
+ * Restores a CUDA process specified by \p pid from its last checkpoint. Process
+ * must be in the CHECKPOINTED state to restore.
+ *
+ * Upon successful return the process will be in the LOCKED state.
+ *
+ * CUDA process restore requires persistence mode to be enabled or ::cuInit to
+ * have been called before execution.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional restore operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa
+ * ::cuInit
+ */
+CUresult CUDAAPI cuCheckpointProcessRestore(int pid, CUcheckpointRestoreArgs *args);
+
+/**
+ * \brief Unlock a CUDA process to allow CUDA API calls
+ *
+ * Unlocks a process specified by \p pid allowing it to resume making CUDA API
+ * calls. Process must be in the LOCKED state.
+ *
+ * Upon successful return the process will be in the RUNNING state.
+ *
+ * \param pid - The process ID of the CUDA process
+ * \param args - Optional unlock operation arguments
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ * ::CUDA_ERROR_NOT_INITIALIZED
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuCheckpointProcessUnlock(int pid, CUcheckpointUnlockArgs *args);
+
+/** @} */ /* End CUDA_CHECKPOINT */
+
+#ifdef __cplusplus
+}
+#endif
+
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility pop
+  #endif
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif /* __cuda_cuda_h__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGL.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGL.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3578faa0304289cdef811af509eded71691352a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGL.h
@@ -0,0 +1,662 @@
+/*
+ * Copyright 2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAEGL_H
+#define CUDAEGL_H
+
+#include "cuda.h"
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+  * \addtogroup CUDA_TYPES
+  * @{
+  */
+
+/**
+ * Maximum number of planes per frame
+ */
+#define MAX_PLANES 3
+
+/**
+  * CUDA EglFrame type - CUDA array or pitch pointer. This is the type of the frameType field of ::CUeglFrame.
+  */
+typedef enum CUeglFrameType_enum {
+    CU_EGL_FRAME_TYPE_ARRAY = 0,  /**< Frame type CUDA array (see the pArray member of ::CUeglFrame) */
+    CU_EGL_FRAME_TYPE_PITCH = 1,  /**< Frame type pointer (see the pPitch member of ::CUeglFrame) */
+} CUeglFrameType;
+
+/**
+ * Indicates that timeout for ::cuEGLStreamConsumerAcquireFrame is infinite.
+ */
+#define CUDA_EGL_INFINITE_TIMEOUT 0xFFFFFFFF
+
+/**
+ * Resource location flags - system memory (sysmem) or video memory (vidmem)
+ *
+ * For CUDA context on iGPU, since video and system memory are equivalent,
+ * these flags will not have an effect on the execution.
+ *
+ * For CUDA context on dGPU, applications can use the flag ::CUeglResourceLocationFlags
+ * to give a hint about the desired location.
+ *
+ * ::CU_EGL_RESOURCE_LOCATION_SYSMEM - the frame data is made resident on the system memory
+ * to be accessed by CUDA.
+ *
+ * ::CU_EGL_RESOURCE_LOCATION_VIDMEM - the frame data is made resident on the dedicated
+ * video memory to be accessed by CUDA.
+ *
+ * There may be an additional latency due to new allocation and data migration
+ * if the frame is produced on a different memory.
+ *
+ */
+typedef enum CUeglResourceLocationFlags_enum {
+    CU_EGL_RESOURCE_LOCATION_SYSMEM   = 0x00,       /**< Resource location sysmem (system memory) */
+    CU_EGL_RESOURCE_LOCATION_VIDMEM   = 0x01        /**< Resource location vidmem (dedicated video memory) */
+} CUeglResourceLocationFlags;
+
+/**
+  * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
+  * Three channel formats are currently not supported for ::CU_EGL_FRAME_TYPE_ARRAY.
+  */
+typedef enum CUeglColorFormat_enum {
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR              = 0x00,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR          = 0x01,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
+    CU_EGL_COLOR_FORMAT_YUV422_PLANAR              = 0x02,  /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR          = 0x03,  /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
+    CU_EGL_COLOR_FORMAT_RGB                        = 0x04,  /**< R/G/B three channels in one surface with BGR byte ordering. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_BGR                        = 0x05,  /**< R/G/B three channels in one surface with RGB byte ordering. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_ARGB                       = 0x06,  /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
+    CU_EGL_COLOR_FORMAT_RGBA                       = 0x07,  /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
+    CU_EGL_COLOR_FORMAT_L                          = 0x08,  /**< single luminance channel in one surface. */
+    CU_EGL_COLOR_FORMAT_R                          = 0x09,  /**< single color channel in one surface. */
+    CU_EGL_COLOR_FORMAT_YUV444_PLANAR              = 0x0A,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR          = 0x0B,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
+    CU_EGL_COLOR_FORMAT_YUYV_422                   = 0x0C,  /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_422                   = 0x0D,  /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
+    CU_EGL_COLOR_FORMAT_ABGR                       = 0x0E,  /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
+    CU_EGL_COLOR_FORMAT_BGRA                       = 0x0F,  /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
+    CU_EGL_COLOR_FORMAT_A                          = 0x10,  /**< Alpha color format - one channel in one surface. */
+    CU_EGL_COLOR_FORMAT_RG                         = 0x11,  /**< R/G color format - two channels in one surface with GR byte ordering */
+    CU_EGL_COLOR_FORMAT_AYUV                       = 0x12,  /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR          = 0x13,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR          = 0x14,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR          = 0x15,  /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR   = 0x16,  /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR   = 0x17,  /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR   = 0x18,  /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR   = 0x19,  /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_VYUY_ER                    = 0x1A,  /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_ER                    = 0x1B,  /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
+    CU_EGL_COLOR_FORMAT_YUYV_ER                    = 0x1C,  /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_YVYU_ER                    = 0x1D,  /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
+    CU_EGL_COLOR_FORMAT_YUV_ER                     = 0x1E,  /**< Extended Range Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_YUVA_ER                    = 0x1F,  /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    CU_EGL_COLOR_FORMAT_AYUV_ER                    = 0x20,  /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    CU_EGL_COLOR_FORMAT_YUV444_PLANAR_ER           = 0x21,  /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_PLANAR_ER           = 0x22,  /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_ER           = 0x23,  /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV444_SEMIPLANAR_ER       = 0x24,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV422_SEMIPLANAR_ER       = 0x25,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_ER       = 0x26,  /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU444_PLANAR_ER           = 0x27,  /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_PLANAR_ER           = 0x28,  /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_ER           = 0x29,  /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU444_SEMIPLANAR_ER       = 0x2A,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_SEMIPLANAR_ER       = 0x2B,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_ER       = 0x2C,  /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_BAYER_RGGB                 = 0x2D,  /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_BGGR                 = 0x2E,  /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_GRBG                 = 0x2F,  /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_GBRG                 = 0x30,  /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER10_RGGB               = 0x31,  /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_BGGR               = 0x32,  /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_GRBG               = 0x33,  /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER10_GBRG               = 0x34,  /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_RGGB               = 0x35,  /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_BGGR               = 0x36,  /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_GRBG               = 0x37,  /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_GBRG               = 0x38,  /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_RGGB               = 0x39,  /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_BGGR               = 0x3A,  /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_GRBG               = 0x3B,  /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER14_GBRG               = 0x3C,  /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_RGGB               = 0x3D,  /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_BGGR               = 0x3E,  /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_GRBG               = 0x3F,  /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER20_GBRG               = 0x40,  /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    CU_EGL_COLOR_FORMAT_YVU444_PLANAR              = 0x41,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU422_PLANAR              = 0x42,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR              = 0x43,  /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_RGGB             = 0x44,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_BGGR             = 0x45,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_GRBG             = 0x46,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_ISP_GBRG             = 0x47,  /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
+    CU_EGL_COLOR_FORMAT_BAYER_BCCR                 = 0x48,  /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_RCCB                 = 0x49,  /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_CRBC                 = 0x4A,  /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER_CBRC                 = 0x4B,  /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
+    CU_EGL_COLOR_FORMAT_BAYER10_CCCC               = 0x4C,  /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_BCCR               = 0x4D,  /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_RCCB               = 0x4E,  /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CRBC               = 0x4F,  /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CBRC               = 0x50,  /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_BAYER12_CCCC               = 0x51,  /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    CU_EGL_COLOR_FORMAT_Y                          = 0x52, /**< Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020     = 0x53, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020     = 0x54, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_2020         = 0x55, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_2020         = 0x56, /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width,
+                                                                 U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709      = 0x57, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709      = 0x58, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YUV420_PLANAR_709          = 0x59, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width,
+                                                                 U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_YVU420_PLANAR_709          = 0x5A,  /**< Y, V, U each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709  = 0x5B, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 = 0x5C, /**< Y10, V10U10 in two surfaces (VU as one surface), U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_2020 = 0x5D, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR      = 0x5E, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_422_SEMIPLANAR_709  = 0x5F, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y_ER                          = 0x60, /**< Extended Range Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_Y_709_ER                      = 0x61, /**< Extended Range Color format for single Y plane. */
+    CU_EGL_COLOR_FORMAT_Y10_ER                        = 0x62, /**< Extended Range Color format for single Y10 plane. */
+    CU_EGL_COLOR_FORMAT_Y10_709_ER                    = 0x63, /**< Extended Range Color format for single Y10 plane. */
+    CU_EGL_COLOR_FORMAT_Y12_ER                        = 0x64, /**< Extended Range Color format for single Y12 plane. */
+    CU_EGL_COLOR_FORMAT_Y12_709_ER                    = 0x65, /**< Extended Range Color format for single Y12 plane. */
+    CU_EGL_COLOR_FORMAT_YUVA                          = 0x66, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    CU_EGL_COLOR_FORMAT_YUV                           = 0x67, /**< Y, U, V three channels in one surface, interleaved as VUY. Only pitch linear format supported. */
+    CU_EGL_COLOR_FORMAT_YVYU                          = 0x68, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
+    CU_EGL_COLOR_FORMAT_VYUY                          = 0x69, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER     = 0x6A, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709_ER = 0x6B, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_ER     = 0x6C, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y10V10U10_444_SEMIPLANAR_709_ER = 0x6D, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_ER     = 0x6E, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_420_SEMIPLANAR_709_ER = 0x6F, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_ER     = 0x70, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_Y12V12U12_444_SEMIPLANAR_709_ER = 0x71, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    CU_EGL_COLOR_FORMAT_UYVY_709                        = 0x72, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_709_ER                     = 0x73, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_UYVY_2020                       = 0x74, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    CU_EGL_COLOR_FORMAT_MAX                                     /**< Not a format: implicit value one past the last enumerator above (0x75). */
+} CUeglColorFormat;
+
+/**
+ * CUDA EGLFrame structure Descriptor - structure defining one frame of EGL.
+ *
+ * Each frame may contain one or more planes depending on whether the surface is multiplanar or not.
+ */
+typedef struct CUeglFrame_st {
+    union {
+        CUarray pArray[MAX_PLANES];     /**< Array of CUarray corresponding to each plane */
+        void*   pPitch[MAX_PLANES];     /**< Array of pointers corresponding to each plane */
+    } frame;
+    unsigned int width;                 /**< Width of first plane */
+    unsigned int height;                /**< Height of first plane */
+    unsigned int depth;                 /**< Depth of first plane */
+    unsigned int pitch;                 /**< Pitch of first plane */
+    unsigned int planeCount;            /**< Number of planes */
+    unsigned int numChannels;           /**< Number of channels for the plane */
+    CUeglFrameType frameType;           /**< Array or Pitch */
+    CUeglColorFormat eglColorFormat;    /**< CUDA EGL Color Format */
+    CUarray_format cuFormat;            /**< CUDA Array Format */
+} CUeglFrame_v1;
+typedef CUeglFrame_v1 CUeglFrame;
+
+/**
+  * CUDA EGLStream Connection
+  */
+typedef struct CUeglStreamConnection_st* CUeglStreamConnection;
+
+/** @} */ /* END CUDA_TYPES */
+
+/**
+ * \file cudaEGL.h
+ * \brief Header file for the EGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_EGL EGL Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ EGL interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the EGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Registers an EGL image
+ *
+ * Registers the EGLImageKHR specified by \p image for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * Additional Mapping/Unmapping is not required for the registered resource and
+ * ::cuGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+ *
+ * The application will be responsible for synchronizing access to shared objects.
+ * The application must ensure that any pending operations which access the objects have completed
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for a
+ * glFinish command on all GL contexts (for OpenGL and likewise for other APIs).
+ * The application will also be responsible for ensuring that any pending operation on the
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
+ * accessing the same memory objects.
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
+ *
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
+ * typedef void* EGLImageKHR
+ *
+ * \param pCudaResource   - Pointer to the returned object handle
+ * \param image           - An EGLImageKHR image which can be used to create target resource.
+ * \param flags           - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuGraphicsEGLRegisterImage, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources,
+ * ::cudaGraphicsEGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsEGLRegisterImage(CUgraphicsResource *pCudaResource, EGLImageKHR image, unsigned int flags);
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p stream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn            - Pointer to the returned connection handle
+ * \param stream          - EGLStreamKHR handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerConnect
+ */
+CUresult CUDAAPI cuEGLStreamConsumerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream);
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p stream with specified \p flags defined by CUeglResourceLocationFlags.
+ *
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
+ * Default is ::CU_EGL_RESOURCE_LOCATION_VIDMEM.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param stream            - EGLStreamKHR handle
+ * \param flags             - Flags denote intended location - system or video.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerConnectWithFlags
+ */
+
+CUresult CUDAAPI cuEGLStreamConsumerConnectWithFlags(CUeglStreamConnection *conn, EGLStreamKHR stream, unsigned int flags);
+
+/**
+ * \brief Disconnect CUDA as a consumer to EGLStream.
+ *
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerDisconnect
+ */
+CUresult CUDAAPI cuEGLStreamConsumerDisconnect(CUeglStreamConnection *conn);
+
+/**
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+ *
+ * Acquire an image frame from EGLStreamKHR. This API can also acquire an old frame presented
+ * by the producer unless explicitly disabled by setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE
+ * during stream initialization. By default, EGLStream is created with this flag set to EGL_TRUE.
+ * ::cuGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ * ::CUeglFrame.
+ *
+ * \param conn            - Connection on which to acquire
+ * \param pCudaResource   - CUDA resource on which the stream frame will be mapped for use.
+ * \param pStream         - CUDA stream for synchronization and any data migrations
+ *                          implied by ::CUeglResourceLocationFlags.
+ * \param timeout         - Desired timeout in usec for a new frame to be acquired.
+ *                          If set as ::CUDA_EGL_INFINITE_TIMEOUT, acquire waits infinitely.
+ *                          After timeout occurs CUDA consumer tries to acquire an old frame
+ *                          if available and EGL_SUPPORT_REUSE_NV flag is set.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerAcquireFrame
+ */
+CUresult CUDAAPI cuEGLStreamConsumerAcquireFrame(CUeglStreamConnection *conn,
+                                                  CUgraphicsResource *pCudaResource, CUstream *pStream, unsigned int timeout);
+/**
+ * \brief Releases the last frame acquired from the EGLStream.
+ *
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ * If the EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE at the time of EGLStream creation,
+ * this API doesn't release the last frame acquired on the EGLStream.
+ * By default, EGLStream is created with this flag set to EGL_TRUE.
+ *
+ * \param conn            - Connection on which to release
+ * \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ * \param pStream         - CUDA stream on which release will be done.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ *
+ * \sa ::cuEGLStreamConsumerConnect, ::cuEGLStreamConsumerDisconnect,
+ * ::cuEGLStreamConsumerAcquireFrame, ::cuEGLStreamConsumerReleaseFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame
+ */
+CUresult CUDAAPI cuEGLStreamConsumerReleaseFrame(CUeglStreamConnection *conn,
+                                                  CUgraphicsResource pCudaResource, CUstream *pStream);
+
+/**
+ * \brief Connect CUDA to EGLStream as a producer.
+ *
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p stream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn   - Pointer to the returned connection handle
+ * \param stream - EGLStreamKHR handle
+ * \param width  - width of the image to be submitted to the stream
+ * \param height - height of the image to be submitted to the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerConnect
+ */
+CUresult CUDAAPI cuEGLStreamProducerConnect(CUeglStreamConnection *conn, EGLStreamKHR stream,
+                                             EGLint width, EGLint height);
+
+/**
+ * \brief Disconnect CUDA as a producer to EGLStream.
+ *
+ * Disconnect CUDA as a producer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerDisconnect
+ */
+CUresult CUDAAPI cuEGLStreamProducerDisconnect(CUeglStreamConnection *conn);
+
+/**
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+ *
+ * When a frame is presented by the producer, it gets associated with the EGLStream
+ * and thus it is illegal to free the frame before the producer is disconnected.
+ * If a frame is freed and reused it may lead to undefined behavior.
+ *
+ * If producer and consumer are on different GPUs (iGPU and dGPU) then frametype
+ * ::CU_EGL_FRAME_TYPE_ARRAY is not supported. ::CU_EGL_FRAME_TYPE_PITCH can be used for
+ * such cross-device applications.
+ *
+ * The ::CUeglFrame is defined as:
+ * \code
+ * typedef struct CUeglFrame_st {
+ *     union {
+ *         CUarray pArray[MAX_PLANES];
+ *         void*   pPitch[MAX_PLANES];
+ *     } frame;
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int planeCount;
+ *     unsigned int numChannels;
+ *     CUeglFrameType frameType;
+ *     CUeglColorFormat eglColorFormat;
+ *     CUarray_format cuFormat;
+ * } CUeglFrame;
+ * \endcode
+ *
+ * For ::CUeglFrame of type ::CU_EGL_FRAME_TYPE_PITCH, the application may present a sub-region of a memory
+ * allocation. In that case, the pitched pointer will specify the start address of the sub-region in
+ * the allocation and corresponding ::CUeglFrame fields will specify the dimensions of the sub-region.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream producer frame handle to be sent to the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to present the frame.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerReturnFrame,
+ * ::cudaEGLStreamProducerPresentFrame
+ */
+CUresult CUDAAPI cuEGLStreamProducerPresentFrame(CUeglStreamConnection *conn,
+                                                 CUeglFrame eglframe, CUstream *pStream);
+
+/**
+ * \brief Return the CUDA eglFrame to the EGLStream released by the consumer.
+ *
+ * This API can potentially return CUDA_ERROR_LAUNCH_TIMEOUT if the consumer has not
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
+ *
+ * \param conn            - Connection on which to return
+ * \param eglframe        - CUDA EGLStream producer frame handle returned from the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to return the frame.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT
+ *
+ * \sa ::cuEGLStreamProducerConnect, ::cuEGLStreamProducerDisconnect,
+ * ::cuEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame
+ */
+CUresult CUDAAPI cuEGLStreamProducerReturnFrame(CUeglStreamConnection *conn,
+                                                CUeglFrame *eglframe, CUstream *pStream);
+
+/**
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
+ *
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
+ * \p resource may be accessed.
+ * This API can only be called for registered EGL graphics resources.
+ *
+ * The ::CUeglFrame is defined as:
+ * \code
+ * typedef struct CUeglFrame_st {
+ *     union {
+ *         CUarray pArray[MAX_PLANES];
+ *         void*   pPitch[MAX_PLANES];
+ *     } frame;
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int planeCount;
+ *     unsigned int numChannels;
+ *     CUeglFrameType frameType;
+ *     CUeglColorFormat eglColorFormat;
+ *     CUarray_format cuFormat;
+ * } CUeglFrame;
+ * \endcode
+ *
+ * If \p resource is not registered then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param eglFrame   - Returned eglFrame.
+ * \param resource   - Registered resource to access.
+ * \param index      - Index for cubemap surfaces.
+ * \param mipLevel   - Mipmap level for the subresource to access.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsResourceGetMappedEglFrame
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedEglFrame(CUeglFrame* eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
+
+/**
+ * \brief Creates an event from EGLSync object
+ *
+ * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
+ * via \p flags. Valid flags include:
+ * - ::CU_EVENT_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+ * synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+ * an event created with this flag will block until the event has actually
+ * been completed.
+ *
+ * Once the \p eglSync gets destroyed, ::cuEventDestroy is the only API
+ * that can be invoked on the event.
+ *
+ * ::cuEventRecord and TimingData are not supported for events created from EGLSync.
+ *
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
+ * typedef void* EGLSyncKHR
+ *
+ * \param phEvent - Returns newly created event
+ * \param eglSync - Opaque handle to EGLSync object
+ * \param flags   - Event creation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy
+ */
+CUresult CUDAAPI cuEventCreateFromEGLSync(CUevent *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+
+/** @} */ /* END CUDA_EGL */
+
+#ifdef __cplusplus
+};
+#endif
+
+#endif
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGLTypedefs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGLTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..61b82337dc4bb280869934b11c2105db62ae20c3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaEGLTypedefs.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAEGLTYPEDEFS_H
+#define CUDAEGLTYPEDEFS_H
+
+#include <cudaEGL.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaEGL.h
+ */
+#define PFN_cuGraphicsEGLRegisterImage  PFN_cuGraphicsEGLRegisterImage_v7000
+#define PFN_cuEGLStreamConsumerConnect  PFN_cuEGLStreamConsumerConnect_v7000
+#define PFN_cuEGLStreamConsumerConnectWithFlags  PFN_cuEGLStreamConsumerConnectWithFlags_v8000
+#define PFN_cuEGLStreamConsumerDisconnect  PFN_cuEGLStreamConsumerDisconnect_v7000
+#define PFN_cuEGLStreamConsumerAcquireFrame  PFN_cuEGLStreamConsumerAcquireFrame_v7000
+#define PFN_cuEGLStreamConsumerReleaseFrame  PFN_cuEGLStreamConsumerReleaseFrame_v7000
+#define PFN_cuEGLStreamProducerConnect  PFN_cuEGLStreamProducerConnect_v7000
+#define PFN_cuEGLStreamProducerDisconnect  PFN_cuEGLStreamProducerDisconnect_v7000
+#define PFN_cuEGLStreamProducerPresentFrame  PFN_cuEGLStreamProducerPresentFrame_v7000
+#define PFN_cuEGLStreamProducerReturnFrame  PFN_cuEGLStreamProducerReturnFrame_v7000
+#define PFN_cuGraphicsResourceGetMappedEglFrame  PFN_cuGraphicsResourceGetMappedEglFrame_v7000
+#define PFN_cuEventCreateFromEGLSync  PFN_cuEventCreateFromEGLSync_v9000
+
+
+/**
+ * Type definitions for functions defined in cudaEGL.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsEGLRegisterImage_v7000)(CUgraphicsResource CUDAAPI *pCudaResource, EGLImageKHR image, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerConnectWithFlags_v8000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerAcquireFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource CUDAAPI *pCudaResource, CUstream CUDAAPI *pStream, unsigned int timeout);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamConsumerReleaseFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUgraphicsResource pCudaResource, CUstream CUDAAPI *pStream);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerConnect_v7000)(CUeglStreamConnection CUDAAPI *conn, EGLStreamKHR stream, EGLint width, EGLint height);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerDisconnect_v7000)(CUeglStreamConnection CUDAAPI *conn);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerPresentFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 eglframe, CUstream CUDAAPI *pStream);
+typedef CUresult (CUDAAPI *PFN_cuEGLStreamProducerReturnFrame_v7000)(CUeglStreamConnection CUDAAPI *conn, CUeglFrame_v1 CUDAAPI *eglframe, CUstream CUDAAPI *pStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedEglFrame_v7000)(CUeglFrame_v1 CUDAAPI *eglFrame, CUgraphicsResource resource, unsigned int index, unsigned int mipLevel);
+typedef CUresult (CUDAAPI *PFN_cuEventCreateFromEGLSync_v9000)(CUevent CUDAAPI *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..81f0d5349e435159647af9af379d1e8e8441221c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGLTypedefs.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAGLTYPEDEFS_H
+#define CUDAGLTYPEDEFS_H
+
+// Dependent includes for cudaGL.h
+#include <GL/gl.h>
+
+#include <cudaGL.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
+#else
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaGL.h
+ */
+#define PFN_cuGraphicsGLRegisterBuffer  PFN_cuGraphicsGLRegisterBuffer_v3000
+#define PFN_cuGraphicsGLRegisterImage  PFN_cuGraphicsGLRegisterImage_v3000
+#define PFN_cuWGLGetDevice  PFN_cuWGLGetDevice_v2020
+#define PFN_cuGLGetDevices  PFN_cuGLGetDevices_v6050
+#define PFN_cuGLCtxCreate  PFN_cuGLCtxCreate_v3020
+#define PFN_cuGLInit  PFN_cuGLInit_v2000
+#define PFN_cuGLRegisterBufferObject  PFN_cuGLRegisterBufferObject_v2000
+#define PFN_cuGLMapBufferObject  __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
+#define PFN_cuGLUnmapBufferObject  PFN_cuGLUnmapBufferObject_v2000
+#define PFN_cuGLUnregisterBufferObject  PFN_cuGLUnregisterBufferObject_v2000
+#define PFN_cuGLSetBufferObjectMapFlags  PFN_cuGLSetBufferObjectMapFlags_v2030
+#define PFN_cuGLMapBufferObjectAsync  __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
+#define PFN_cuGLUnmapBufferObjectAsync  PFN_cuGLUnmapBufferObjectAsync_v2030
+
+
+/**
+ * Type definitions for functions defined in cudaGL.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+#ifdef _WIN32
+typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
+#endif
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
+
+/*
+ * Type definitions for older versioned functions in cudaGL.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaProfilerTypedefs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaProfilerTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..bea7df4573aff2fa5b0d0029ce9d40a7ebe2de46
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaProfilerTypedefs.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAPROFILERTYPEDEFS_H
+#define CUDAPROFILERTYPEDEFS_H
+
+#include <cudaProfiler.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaProfiler.h
+ */
+#define PFN_cuProfilerInitialize  PFN_cuProfilerInitialize_v4000
+#define PFN_cuProfilerStart  PFN_cuProfilerStart_v4000
+#define PFN_cuProfilerStop  PFN_cuProfilerStop_v4000
+
+
+/**
+ * Type definitions for functions defined in cudaProfiler.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaTypedefs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..4957e9a07cdb10141586620f9e0f4d36ede345be
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaTypedefs.h
@@ -0,0 +1,1144 @@
+/*
+ * Copyright 2020-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDATYPEDEFS_H
+#define CUDATYPEDEFS_H
+
+#include <cuda.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) // paste <api>_v<ptds_version>_ptds / <api>_v<ptds_version>_ptsz; default_version is ignored
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
+#else // paste the plain <api>_v<default_version> name for both macros; ptds_version is ignored
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
+#endif // CUDA_API_PER_THREAD_DEFAULT_STREAM
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cuda.h
+ */
+#define PFN_cuGetErrorString  PFN_cuGetErrorString_v6000
+#define PFN_cuGetErrorName  PFN_cuGetErrorName_v6000
+#define PFN_cuInit  PFN_cuInit_v2000
+#define PFN_cuDriverGetVersion  PFN_cuDriverGetVersion_v2020
+#define PFN_cuDeviceGet  PFN_cuDeviceGet_v2000
+#define PFN_cuDeviceGetCount  PFN_cuDeviceGetCount_v2000
+#define PFN_cuDeviceGetName  PFN_cuDeviceGetName_v2000
+#define PFN_cuDeviceGetUuid  PFN_cuDeviceGetUuid_v11040
+#define PFN_cuDeviceGetLuid  PFN_cuDeviceGetLuid_v10000
+#define PFN_cuDeviceTotalMem  PFN_cuDeviceTotalMem_v3020
+#define PFN_cuDeviceGetTexture1DLinearMaxWidth  PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
+#define PFN_cuDeviceGetAttribute  PFN_cuDeviceGetAttribute_v2000
+#define PFN_cuDeviceGetNvSciSyncAttributes  PFN_cuDeviceGetNvSciSyncAttributes_v10020
+#define PFN_cuDeviceSetMemPool  PFN_cuDeviceSetMemPool_v11020
+#define PFN_cuDeviceGetMemPool  PFN_cuDeviceGetMemPool_v11020
+#define PFN_cuDeviceGetDefaultMemPool  PFN_cuDeviceGetDefaultMemPool_v11020
+#define PFN_cuDeviceGetProperties  PFN_cuDeviceGetProperties_v2000
+#define PFN_cuDeviceComputeCapability  PFN_cuDeviceComputeCapability_v2000
+#define PFN_cuDevicePrimaryCtxRetain  PFN_cuDevicePrimaryCtxRetain_v7000
+#define PFN_cuDevicePrimaryCtxRelease  PFN_cuDevicePrimaryCtxRelease_v11000
+#define PFN_cuDevicePrimaryCtxSetFlags  PFN_cuDevicePrimaryCtxSetFlags_v11000
+#define PFN_cuDevicePrimaryCtxGetState  PFN_cuDevicePrimaryCtxGetState_v7000
+#define PFN_cuDevicePrimaryCtxReset  PFN_cuDevicePrimaryCtxReset_v11000
+#define PFN_cuDeviceGetExecAffinitySupport  PFN_cuDeviceGetExecAffinitySupport_v11040
+#define PFN_cuCtxCreate  PFN_cuCtxCreate_v11040
+#define PFN_cuCtxGetId  PFN_cuCtxGetId_v12000
+#define PFN_cuCtxDestroy  PFN_cuCtxDestroy_v4000
+#define PFN_cuCtxPushCurrent  PFN_cuCtxPushCurrent_v4000
+#define PFN_cuCtxPopCurrent  PFN_cuCtxPopCurrent_v4000
+#define PFN_cuCtxSetCurrent  PFN_cuCtxSetCurrent_v4000
+#define PFN_cuCtxGetCurrent  PFN_cuCtxGetCurrent_v4000
+#define PFN_cuCtxGetDevice  PFN_cuCtxGetDevice_v2000
+#define PFN_cuCtxGetFlags  PFN_cuCtxGetFlags_v7000
+#define PFN_cuCtxSetFlags  PFN_cuCtxSetFlags_v12010
+#define PFN_cuCtxSynchronize  PFN_cuCtxSynchronize_v2000
+#define PFN_cuCtxSetLimit  PFN_cuCtxSetLimit_v3010
+#define PFN_cuCtxGetLimit  PFN_cuCtxGetLimit_v3010
+#define PFN_cuCtxGetCacheConfig  PFN_cuCtxGetCacheConfig_v3020
+#define PFN_cuCtxSetCacheConfig  PFN_cuCtxSetCacheConfig_v3020
+#define PFN_cuCtxGetSharedMemConfig  PFN_cuCtxGetSharedMemConfig_v4020
+#define PFN_cuCtxSetSharedMemConfig  PFN_cuCtxSetSharedMemConfig_v4020
+#define PFN_cuCtxGetApiVersion  PFN_cuCtxGetApiVersion_v3020
+#define PFN_cuCtxGetStreamPriorityRange  PFN_cuCtxGetStreamPriorityRange_v5050
+#define PFN_cuCtxResetPersistingL2Cache  PFN_cuCtxResetPersistingL2Cache_v11000
+#define PFN_cuCtxAttach  PFN_cuCtxAttach_v2000
+#define PFN_cuCtxDetach  PFN_cuCtxDetach_v2000
+#define PFN_cuCtxGetExecAffinity  PFN_cuCtxGetExecAffinity_v11040
+#define PFN_cuModuleLoad  PFN_cuModuleLoad_v2000
+#define PFN_cuModuleLoadData  PFN_cuModuleLoadData_v2000
+#define PFN_cuModuleLoadDataEx  PFN_cuModuleLoadDataEx_v2010
+#define PFN_cuModuleLoadFatBinary  PFN_cuModuleLoadFatBinary_v2000
+#define PFN_cuModuleUnload  PFN_cuModuleUnload_v2000
+#define PFN_cuModuleGetFunction  PFN_cuModuleGetFunction_v2000
+#define PFN_cuModuleGetGlobal  PFN_cuModuleGetGlobal_v3020
+#define PFN_cuModuleGetTexRef  PFN_cuModuleGetTexRef_v2000
+#define PFN_cuModuleGetSurfRef  PFN_cuModuleGetSurfRef_v3000
+#define PFN_cuModuleGetFunctionCount PFN_cuModuleGetFunctionCount_v12040
+#define PFN_cuModuleEnumerateFunctions PFN_cuModuleEnumerateFunctions_v12040
+#define PFN_cuLinkCreate  PFN_cuLinkCreate_v6050
+#define PFN_cuLinkAddData  PFN_cuLinkAddData_v6050
+#define PFN_cuLinkAddFile  PFN_cuLinkAddFile_v6050
+#define PFN_cuLinkComplete  PFN_cuLinkComplete_v5050
+#define PFN_cuLinkDestroy  PFN_cuLinkDestroy_v5050
+#define PFN_cuMemGetInfo  PFN_cuMemGetInfo_v3020
+#define PFN_cuMemAlloc  PFN_cuMemAlloc_v3020
+#define PFN_cuMemAllocPitch  PFN_cuMemAllocPitch_v3020
+#define PFN_cuMemFree  PFN_cuMemFree_v3020
+#define PFN_cuMemGetAddressRange  PFN_cuMemGetAddressRange_v3020
+#define PFN_cuMemAllocHost  PFN_cuMemAllocHost_v3020
+#define PFN_cuMemFreeHost  PFN_cuMemFreeHost_v2000
+#define PFN_cuMemHostAlloc  PFN_cuMemHostAlloc_v2020
+#define PFN_cuMemHostGetDevicePointer  PFN_cuMemHostGetDevicePointer_v3020
+#define PFN_cuMemHostGetFlags  PFN_cuMemHostGetFlags_v2030
+#define PFN_cuMemAllocManaged  PFN_cuMemAllocManaged_v6000
+#define PFN_cuDeviceGetByPCIBusId  PFN_cuDeviceGetByPCIBusId_v4010
+#define PFN_cuDeviceGetPCIBusId  PFN_cuDeviceGetPCIBusId_v4010
+#define PFN_cuIpcGetEventHandle  PFN_cuIpcGetEventHandle_v4010
+#define PFN_cuIpcOpenEventHandle  PFN_cuIpcOpenEventHandle_v4010
+#define PFN_cuIpcGetMemHandle  PFN_cuIpcGetMemHandle_v4010
+#define PFN_cuIpcOpenMemHandle  PFN_cuIpcOpenMemHandle_v11000
+#define PFN_cuIpcCloseMemHandle  PFN_cuIpcCloseMemHandle_v4010
+#define PFN_cuMemHostRegister  PFN_cuMemHostRegister_v6050
+#define PFN_cuMemHostUnregister  PFN_cuMemHostUnregister_v4000
+#define PFN_cuMemcpy  __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
+#define PFN_cuMemcpyPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
+#define PFN_cuMemcpyHtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
+#define PFN_cuMemcpyDtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
+#define PFN_cuMemcpyHtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
+#define PFN_cuMemcpyAtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
+#define PFN_cuMemcpy2D  __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
+#define PFN_cuMemcpy2DUnaligned  __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
+#define PFN_cuMemcpy3D  __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
+#define PFN_cuMemcpy3DPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
+#define PFN_cuMemcpyAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
+#define PFN_cuMemcpyPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
+#define PFN_cuMemcpyHtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyHtoAAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
+#define PFN_cuMemcpyAtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
+#define PFN_cuMemcpy2DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
+#define PFN_cuMemcpyBatchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyBatchAsync, 12080, 12080)
+#define PFN_cuMemcpy3DBatchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DBatchAsync, 12080, 12080)
+#define PFN_cuMemsetD8  __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
+#define PFN_cuMemsetD16  __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
+#define PFN_cuMemsetD32  __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
+#define PFN_cuMemsetD2D8  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
+#define PFN_cuMemsetD2D16  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
+#define PFN_cuMemsetD2D32  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
+#define PFN_cuMemsetD8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
+#define PFN_cuMemsetD16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
+#define PFN_cuMemsetD32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
+#define PFN_cuMemsetD2D8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
+#define PFN_cuMemsetD2D16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
+#define PFN_cuMemsetD2D32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
+#define PFN_cuArrayCreate  PFN_cuArrayCreate_v3020
+#define PFN_cuArrayGetDescriptor  PFN_cuArrayGetDescriptor_v3020
+#define PFN_cuArrayGetSparseProperties  PFN_cuArrayGetSparseProperties_v11010
+#define PFN_cuMipmappedArrayGetSparseProperties  PFN_cuMipmappedArrayGetSparseProperties_v11010
+#define PFN_cuArrayGetMemoryRequirements  PFN_cuArrayGetMemoryRequirements_v11060
+#define PFN_cuMipmappedArrayGetMemoryRequirements  PFN_cuMipmappedArrayGetMemoryRequirements_v11060
+#define PFN_cuArrayGetPlane  PFN_cuArrayGetPlane_v11020
+#define PFN_cuArrayDestroy  PFN_cuArrayDestroy_v2000
+#define PFN_cuArray3DCreate  PFN_cuArray3DCreate_v3020
+#define PFN_cuArray3DGetDescriptor  PFN_cuArray3DGetDescriptor_v3020
+#define PFN_cuMipmappedArrayCreate  PFN_cuMipmappedArrayCreate_v5000
+#define PFN_cuMipmappedArrayGetLevel  PFN_cuMipmappedArrayGetLevel_v5000
+#define PFN_cuMipmappedArrayDestroy  PFN_cuMipmappedArrayDestroy_v5000
+#define PFN_cuMemAddressReserve  PFN_cuMemAddressReserve_v10020
+#define PFN_cuMemAddressFree  PFN_cuMemAddressFree_v10020
+#define PFN_cuMemCreate  PFN_cuMemCreate_v10020
+#define PFN_cuMemRelease  PFN_cuMemRelease_v10020
+#define PFN_cuMemMap  PFN_cuMemMap_v10020
+#define PFN_cuMemMapArrayAsync  __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
+#define PFN_cuMemUnmap  PFN_cuMemUnmap_v10020
+#define PFN_cuMemSetAccess  PFN_cuMemSetAccess_v10020
+#define PFN_cuMemGetAccess  PFN_cuMemGetAccess_v10020
+#define PFN_cuMemExportToShareableHandle  PFN_cuMemExportToShareableHandle_v10020
+#define PFN_cuMemImportFromShareableHandle  PFN_cuMemImportFromShareableHandle_v10020
+#define PFN_cuMemGetAllocationGranularity  PFN_cuMemGetAllocationGranularity_v10020
+#define PFN_cuMemGetAllocationPropertiesFromHandle  PFN_cuMemGetAllocationPropertiesFromHandle_v10020
+#define PFN_cuMemRetainAllocationHandle  PFN_cuMemRetainAllocationHandle_v11000
+#define PFN_cuMemFreeAsync  __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
+#define PFN_cuMemAllocAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
+#define PFN_cuMemPoolTrimTo  PFN_cuMemPoolTrimTo_v11020
+#define PFN_cuMemPoolSetAttribute  PFN_cuMemPoolSetAttribute_v11020
+#define PFN_cuMemPoolGetAttribute  PFN_cuMemPoolGetAttribute_v11020
+#define PFN_cuMemPoolSetAccess  PFN_cuMemPoolSetAccess_v11020
+#define PFN_cuMemPoolGetAccess  PFN_cuMemPoolGetAccess_v11020
+#define PFN_cuMemPoolCreate  PFN_cuMemPoolCreate_v11020
+#define PFN_cuMemPoolDestroy  PFN_cuMemPoolDestroy_v11020
+#define PFN_cuMemAllocFromPoolAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
+#define PFN_cuMemPoolExportToShareableHandle  PFN_cuMemPoolExportToShareableHandle_v11020
+#define PFN_cuMemPoolImportFromShareableHandle  PFN_cuMemPoolImportFromShareableHandle_v11020
+#define PFN_cuMemPoolExportPointer  PFN_cuMemPoolExportPointer_v11020
+#define PFN_cuMemPoolImportPointer  PFN_cuMemPoolImportPointer_v11020
+#define PFN_cuPointerGetAttribute  PFN_cuPointerGetAttribute_v4000
+#define PFN_cuMemPrefetchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
+#define PFN_cuMemAdvise  PFN_cuMemAdvise_v8000
+#define PFN_cuMemAdvise_v2  PFN_cuMemAdvise_v12020
+#define PFN_cuMemPrefetchAsync_v2  __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 12020, 12020)
+#define PFN_cuMemRangeGetAttribute  PFN_cuMemRangeGetAttribute_v8000
+#define PFN_cuMemRangeGetAttributes  PFN_cuMemRangeGetAttributes_v8000
+#define PFN_cuMulticastCreate  PFN_cuMulticastCreate_v12010
+#define PFN_cuMulticastAddDevice  PFN_cuMulticastAddDevice_v12010
+#define PFN_cuMulticastBindMem  PFN_cuMulticastBindMem_v12010
+#define PFN_cuMulticastBindAddr  PFN_cuMulticastBindAddr_v12010
+#define PFN_cuMulticastUnbind  PFN_cuMulticastUnbind_v12010
+#define PFN_cuMulticastGetGranularity  PFN_cuMulticastGetGranularity_v12010
+#define PFN_cuPointerSetAttribute  PFN_cuPointerSetAttribute_v6000
+#define PFN_cuPointerGetAttributes  PFN_cuPointerGetAttributes_v7000
+#define PFN_cuStreamCreate  PFN_cuStreamCreate_v2000
+#define PFN_cuStreamCreateWithPriority  PFN_cuStreamCreateWithPriority_v5050
+#define PFN_cuStreamGetId  __API_TYPEDEF_PTSZ(PFN_cuStreamGetId, 12000, 12000)
+#define PFN_cuStreamGetPriority  __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
+#define PFN_cuStreamGetFlags  __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
+#define PFN_cuStreamGetDevice __API_TYPEDEF_PTSZ(PFN_cuStreamGetDevice, 12080, 12080)
+#define PFN_cuStreamGetCtx  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
+#define PFN_cuStreamWaitEvent  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
+#define PFN_cuStreamAddCallback  __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
+#define PFN_cuStreamBeginCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
+#define PFN_cuStreamBeginCaptureToGraph  __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCaptureToGraph, 12030, 12030)
+#define PFN_cuThreadExchangeStreamCaptureMode  PFN_cuThreadExchangeStreamCaptureMode_v10010
+#define PFN_cuStreamEndCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
+#define PFN_cuStreamIsCapturing  __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
+#define PFN_cuStreamGetCaptureInfo  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
+#define PFN_cuStreamGetCaptureInfo_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
+#define PFN_cuStreamGetCaptureInfo_v3  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 12030, 12030)
+#define PFN_cuStreamUpdateCaptureDependencies  __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
+#define PFN_cuStreamUpdateCaptureDependencies_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 12030, 12030)
+#define PFN_cuStreamAttachMemAsync  __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
+#define PFN_cuStreamQuery  __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
+#define PFN_cuStreamSynchronize  __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
+#define PFN_cuStreamDestroy  PFN_cuStreamDestroy_v4000
+#define PFN_cuStreamCopyAttributes  __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
+#define PFN_cuStreamGetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
+#define PFN_cuStreamSetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
+#define PFN_cuEventCreate  PFN_cuEventCreate_v2000
+#define PFN_cuEventRecord  __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
+#define PFN_cuEventRecordWithFlags  __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
+#define PFN_cuEventQuery  PFN_cuEventQuery_v2000
+#define PFN_cuEventSynchronize  PFN_cuEventSynchronize_v2000
+#define PFN_cuEventDestroy  PFN_cuEventDestroy_v4000
+#define PFN_cuEventElapsedTime  PFN_cuEventElapsedTime_v2000
+#define PFN_cuEventElapsedTime_v2  PFN_cuEventElapsedTime_v12080
+#define PFN_cuImportExternalMemory  PFN_cuImportExternalMemory_v10000
+#define PFN_cuExternalMemoryGetMappedBuffer  PFN_cuExternalMemoryGetMappedBuffer_v10000
+#define PFN_cuExternalMemoryGetMappedMipmappedArray  PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
+#define PFN_cuDestroyExternalMemory  PFN_cuDestroyExternalMemory_v10000
+#define PFN_cuImportExternalSemaphore  PFN_cuImportExternalSemaphore_v10000
+#define PFN_cuSignalExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuWaitExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuDestroyExternalSemaphore  PFN_cuDestroyExternalSemaphore_v10000
+#define PFN_cuStreamWaitValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
+#define PFN_cuStreamWaitValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
+#define PFN_cuStreamWriteValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
+#define PFN_cuStreamWriteValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
+#define PFN_cuStreamBatchMemOp  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
+#define PFN_cuStreamWaitValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070)
+#define PFN_cuStreamWaitValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070)
+#define PFN_cuStreamWriteValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070)
+#define PFN_cuStreamWriteValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070)
+#define PFN_cuStreamBatchMemOp_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070)
+#define PFN_cuFuncGetAttribute  PFN_cuFuncGetAttribute_v2020
+#define PFN_cuFuncSetAttribute  PFN_cuFuncSetAttribute_v9000
+#define PFN_cuFuncSetCacheConfig  PFN_cuFuncSetCacheConfig_v3000
+#define PFN_cuFuncSetSharedMemConfig  PFN_cuFuncSetSharedMemConfig_v4020
+#define PFN_cuFuncGetName  PFN_cuFuncGetName_v12030
+#define PFN_cuFuncGetParamInfo  PFN_cuFuncGetParamInfo_v12040
+#define PFN_cuFuncIsLoaded PFN_cuFuncIsLoaded_v12040
+#define PFN_cuFuncLoad PFN_cuFuncLoad_v12040
+#define PFN_cuLaunchKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
+#define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060)
+#define PFN_cuLaunchCooperativeKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
+#define PFN_cuLaunchCooperativeKernelMultiDevice  PFN_cuLaunchCooperativeKernelMultiDevice_v9000
+#define PFN_cuLaunchHostFunc  __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
+#define PFN_cuFuncSetBlockShape  PFN_cuFuncSetBlockShape_v2000
+#define PFN_cuFuncSetSharedSize  PFN_cuFuncSetSharedSize_v2000
+#define PFN_cuParamSetSize  PFN_cuParamSetSize_v2000
+#define PFN_cuParamSeti  PFN_cuParamSeti_v2000
+#define PFN_cuParamSetf  PFN_cuParamSetf_v2000
+#define PFN_cuParamSetv  PFN_cuParamSetv_v2000
+#define PFN_cuLaunch  PFN_cuLaunch_v2000
+#define PFN_cuLaunchGrid  PFN_cuLaunchGrid_v2000
+#define PFN_cuLaunchGridAsync  PFN_cuLaunchGridAsync_v2000
+#define PFN_cuParamSetTexRef  PFN_cuParamSetTexRef_v2000
+#define PFN_cuGraphCreate  PFN_cuGraphCreate_v10000
+#define PFN_cuGraphAddKernelNode  PFN_cuGraphAddKernelNode_v12000
+#define PFN_cuGraphKernelNodeGetParams  PFN_cuGraphKernelNodeGetParams_v12000
+#define PFN_cuGraphKernelNodeSetParams  PFN_cuGraphKernelNodeSetParams_v12000
+#define PFN_cuGraphAddMemcpyNode  PFN_cuGraphAddMemcpyNode_v10000
+#define PFN_cuGraphMemcpyNodeGetParams  PFN_cuGraphMemcpyNodeGetParams_v10000
+#define PFN_cuGraphMemcpyNodeSetParams  PFN_cuGraphMemcpyNodeSetParams_v10000
+#define PFN_cuGraphAddMemsetNode  PFN_cuGraphAddMemsetNode_v10000
+#define PFN_cuGraphMemsetNodeGetParams  PFN_cuGraphMemsetNodeGetParams_v10000
+#define PFN_cuGraphMemsetNodeSetParams  PFN_cuGraphMemsetNodeSetParams_v10000
+#define PFN_cuGraphAddHostNode  PFN_cuGraphAddHostNode_v10000
+#define PFN_cuGraphHostNodeGetParams  PFN_cuGraphHostNodeGetParams_v10000
+#define PFN_cuGraphHostNodeSetParams  PFN_cuGraphHostNodeSetParams_v10000
+#define PFN_cuGraphAddChildGraphNode  PFN_cuGraphAddChildGraphNode_v10000
+#define PFN_cuGraphChildGraphNodeGetGraph  PFN_cuGraphChildGraphNodeGetGraph_v10000
+#define PFN_cuGraphAddEmptyNode  PFN_cuGraphAddEmptyNode_v10000
+#define PFN_cuGraphAddEventRecordNode  PFN_cuGraphAddEventRecordNode_v11010
+#define PFN_cuGraphEventRecordNodeGetEvent  PFN_cuGraphEventRecordNodeGetEvent_v11010
+#define PFN_cuGraphEventRecordNodeSetEvent  PFN_cuGraphEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphAddEventWaitNode  PFN_cuGraphAddEventWaitNode_v11010
+#define PFN_cuGraphEventWaitNodeGetEvent  PFN_cuGraphEventWaitNodeGetEvent_v11010
+#define PFN_cuGraphEventWaitNodeSetEvent  PFN_cuGraphEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphAddExternalSemaphoresSignalNode  PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams  PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphAddExternalSemaphoresWaitNode  PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams  PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070
+#define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070
+#define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070
+#define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070
+#define PFN_cuGraphClone  PFN_cuGraphClone_v10000
+#define PFN_cuGraphNodeFindInClone  PFN_cuGraphNodeFindInClone_v10000
+#define PFN_cuGraphNodeGetType  PFN_cuGraphNodeGetType_v10000
+#define PFN_cuGraphGetNodes  PFN_cuGraphGetNodes_v10000
+#define PFN_cuGraphGetRootNodes  PFN_cuGraphGetRootNodes_v10000
+#define PFN_cuGraphGetEdges  PFN_cuGraphGetEdges_v12030
+#define PFN_cuGraphNodeGetDependencies  PFN_cuGraphNodeGetDependencies_v12030
+#define PFN_cuGraphNodeGetDependentNodes  PFN_cuGraphNodeGetDependentNodes_v12030
+#define PFN_cuGraphAddDependencies  PFN_cuGraphAddDependencies_v12030
+#define PFN_cuGraphRemoveDependencies  PFN_cuGraphRemoveDependencies_v12030
+#define PFN_cuGraphDestroyNode  PFN_cuGraphDestroyNode_v10000
+
+#define PFN_cuGraphInstantiate  PFN_cuGraphInstantiateWithFlags_v11040
+
+#define PFN_cuGraphInstantiateWithFlags  PFN_cuGraphInstantiateWithFlags_v11040
+#define PFN_cuGraphInstantiateWithParams  __API_TYPEDEF_PTSZ(PFN_cuGraphInstantiateWithParams, 12000, 12000)
+#define PFN_cuGraphExecGetFlags  PFN_cuGraphExecGetFlags_v12000
+#define PFN_cuGraphExecKernelNodeSetParams  PFN_cuGraphExecKernelNodeSetParams_v12000
+#define PFN_cuGraphExecMemcpyNodeSetParams  PFN_cuGraphExecMemcpyNodeSetParams_v10020
+#define PFN_cuGraphExecMemsetNodeSetParams  PFN_cuGraphExecMemsetNodeSetParams_v10020
+#define PFN_cuGraphExecHostNodeSetParams  PFN_cuGraphExecHostNodeSetParams_v10020
+#define PFN_cuGraphExecChildGraphNodeSetParams  PFN_cuGraphExecChildGraphNodeSetParams_v11010
+#define PFN_cuGraphExecEventRecordNodeSetEvent  PFN_cuGraphExecEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphExecEventWaitNodeSetEvent  PFN_cuGraphExecEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphUpload  __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
+#define PFN_cuGraphLaunch  __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
+#define PFN_cuGraphExecDestroy  PFN_cuGraphExecDestroy_v10000
+#define PFN_cuGraphDestroy  PFN_cuGraphDestroy_v10000
+#define PFN_cuGraphExecUpdate  PFN_cuGraphExecUpdate_v12000
+#define PFN_cuGraphKernelNodeCopyAttributes  PFN_cuGraphKernelNodeCopyAttributes_v11000
+#define PFN_cuGraphKernelNodeGetAttribute  PFN_cuGraphKernelNodeGetAttribute_v11000
+#define PFN_cuGraphKernelNodeSetAttribute  PFN_cuGraphKernelNodeSetAttribute_v11000
+#define PFN_cuGraphDebugDotPrint  PFN_cuGraphDebugDotPrint_v11030
+#define PFN_cuGraphAddMemAllocNode  PFN_cuGraphAddMemAllocNode_v11040
+#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
+#define PFN_cuGraphAddMemFreeNode  PFN_cuGraphAddMemFreeNode_v11040
+#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
+#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
+#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
+#define PFN_cuGraphAddNode PFN_cuGraphAddNode_v12030
+#define PFN_cuGraphNodeSetParams PFN_cuGraphNodeSetParams_v12020
+#define PFN_cuGraphExecNodeSetParams PFN_cuGraphExecNodeSetParams_v12020
+#define PFN_GraphConditionalHandleCreate PFN_cuGraphConditionalHandleCreate_v12030 /* NOTE(review): alias name lacks the "cu" prefix used by every other alias in this list; spelling kept so existing users still compile */
+#define PFN_cuDeviceGraphMemTrim  PFN_cuDeviceGraphMemTrim_v11040
+#define PFN_cuDeviceGetGraphMemAttribute  PFN_cuDeviceGetGraphMemAttribute_v11040
+#define PFN_cuDeviceSetGraphMemAttribute  PFN_cuDeviceSetGraphMemAttribute_v11040
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
+#define PFN_cuOccupancyMaxPotentialBlockSize  PFN_cuOccupancyMaxPotentialBlockSize_v6050
+#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags  PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
+#define PFN_cuOccupancyAvailableDynamicSMemPerBlock  PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
+#define PFN_cuOccupancyMaxPotentialClusterSize  PFN_cuOccupancyMaxPotentialClusterSize_v11070
+#define PFN_cuOccupancyMaxActiveClusters  PFN_cuOccupancyMaxActiveClusters_v11070
+#define PFN_cuTexRefSetArray  PFN_cuTexRefSetArray_v2000
+#define PFN_cuTexRefSetMipmappedArray  PFN_cuTexRefSetMipmappedArray_v5000
+#define PFN_cuTexRefSetAddress  PFN_cuTexRefSetAddress_v3020
+#define PFN_cuTexRefSetAddress2D  PFN_cuTexRefSetAddress2D_v4010
+#define PFN_cuTexRefSetFormat  PFN_cuTexRefSetFormat_v2000
+#define PFN_cuTexRefSetAddressMode  PFN_cuTexRefSetAddressMode_v2000
+#define PFN_cuTexRefSetFilterMode  PFN_cuTexRefSetFilterMode_v2000
+#define PFN_cuTexRefSetMipmapFilterMode  PFN_cuTexRefSetMipmapFilterMode_v5000
+#define PFN_cuTexRefSetMipmapLevelBias  PFN_cuTexRefSetMipmapLevelBias_v5000
+#define PFN_cuTexRefSetMipmapLevelClamp  PFN_cuTexRefSetMipmapLevelClamp_v5000
+#define PFN_cuTexRefSetMaxAnisotropy  PFN_cuTexRefSetMaxAnisotropy_v5000
+#define PFN_cuTexRefSetBorderColor  PFN_cuTexRefSetBorderColor_v8000
+#define PFN_cuTexRefSetFlags  PFN_cuTexRefSetFlags_v2000
+#define PFN_cuTexRefGetAddress  PFN_cuTexRefGetAddress_v3020
+#define PFN_cuTexRefGetArray  PFN_cuTexRefGetArray_v2000
+#define PFN_cuTexRefGetMipmappedArray  PFN_cuTexRefGetMipmappedArray_v5000
+#define PFN_cuTexRefGetAddressMode  PFN_cuTexRefGetAddressMode_v2000
+#define PFN_cuTexRefGetFilterMode  PFN_cuTexRefGetFilterMode_v2000
+#define PFN_cuTexRefGetFormat  PFN_cuTexRefGetFormat_v2000
+#define PFN_cuTexRefGetMipmapFilterMode  PFN_cuTexRefGetMipmapFilterMode_v5000
+#define PFN_cuTexRefGetMipmapLevelBias  PFN_cuTexRefGetMipmapLevelBias_v5000
+#define PFN_cuTexRefGetMipmapLevelClamp  PFN_cuTexRefGetMipmapLevelClamp_v5000
+#define PFN_cuTexRefGetMaxAnisotropy  PFN_cuTexRefGetMaxAnisotropy_v5000
+#define PFN_cuTexRefGetBorderColor  PFN_cuTexRefGetBorderColor_v8000
+#define PFN_cuTexRefGetFlags  PFN_cuTexRefGetFlags_v2000
+#define PFN_cuTexRefCreate  PFN_cuTexRefCreate_v2000 /* Alias macros: each line maps an unversioned PFN_<api> name to one versioned PFN_<api>_vNNNN name. */
+#define PFN_cuTexRefDestroy  PFN_cuTexRefDestroy_v2000
+#define PFN_cuSurfRefSetArray  PFN_cuSurfRefSetArray_v3000
+#define PFN_cuSurfRefGetArray  PFN_cuSurfRefGetArray_v3000
+#define PFN_cuTexObjectCreate  PFN_cuTexObjectCreate_v5000
+#define PFN_cuTexObjectDestroy  PFN_cuTexObjectDestroy_v5000
+#define PFN_cuTexObjectGetResourceDesc  PFN_cuTexObjectGetResourceDesc_v5000
+#define PFN_cuTexObjectGetTextureDesc  PFN_cuTexObjectGetTextureDesc_v5000
+#define PFN_cuTexObjectGetResourceViewDesc  PFN_cuTexObjectGetResourceViewDesc_v5000
+#define PFN_cuSurfObjectCreate  PFN_cuSurfObjectCreate_v5000
+#define PFN_cuSurfObjectDestroy  PFN_cuSurfObjectDestroy_v5000
+#define PFN_cuSurfObjectGetResourceDesc  PFN_cuSurfObjectGetResourceDesc_v5000
+#define PFN_cuTensorMapEncodeTiled  PFN_cuTensorMapEncodeTiled_v12000
+#define PFN_cuTensorMapEncodeIm2col  PFN_cuTensorMapEncodeIm2col_v12000
+#define PFN_cuTensorMapReplaceAddress  PFN_cuTensorMapReplaceAddress_v12000
+#define PFN_cuTensorMapEncodeIm2colWide  PFN_cuTensorMapEncodeIm2colWide_v12080
+#define PFN_cuDeviceCanAccessPeer  PFN_cuDeviceCanAccessPeer_v4000
+#define PFN_cuCtxEnablePeerAccess  PFN_cuCtxEnablePeerAccess_v4000
+#define PFN_cuCtxDisablePeerAccess  PFN_cuCtxDisablePeerAccess_v4000
+#define PFN_cuDeviceGetP2PAttribute  PFN_cuDeviceGetP2PAttribute_v8000
+#define PFN_cuGraphicsUnregisterResource  PFN_cuGraphicsUnregisterResource_v3000
+#define PFN_cuGraphicsSubResourceGetMappedArray  PFN_cuGraphicsSubResourceGetMappedArray_v3000
+#define PFN_cuGraphicsResourceGetMappedMipmappedArray  PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
+#define PFN_cuGraphicsResourceGetMappedPointer  PFN_cuGraphicsResourceGetMappedPointer_v3020
+#define PFN_cuGraphicsResourceSetMapFlags  PFN_cuGraphicsResourceSetMapFlags_v6050
+#define PFN_cuGraphicsMapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000) /* Resolved through __API_TYPEDEF_PTSZ, which is defined outside this chunk; presumably chooses between a plain and a _ptsz typedef variant using the two version numbers - TODO confirm. */
+#define PFN_cuGraphicsUnmapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000) /* Same helper macro and same two version arguments as the line above. */
+#define PFN_cuGetExportTable  PFN_cuGetExportTable_v3000
+#define PFN_cuFuncGetModule  PFN_cuFuncGetModule_v11000
+#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
+#define PFN_cuGetProcAddress  PFN_cuGetProcAddress_v12000
+#define PFN_cuUserObjectCreate  PFN_cuUserObjectCreate_v11030
+#define PFN_cuUserObjectRetain  PFN_cuUserObjectRetain_v11030
+#define PFN_cuUserObjectRelease  PFN_cuUserObjectRelease_v11030
+#define PFN_cuGraphRetainUserObject  PFN_cuGraphRetainUserObject_v11030
+#define PFN_cuGraphReleaseUserObject  PFN_cuGraphReleaseUserObject_v11030
+#define PFN_cuModuleGetLoadingMode  PFN_cuModuleGetLoadingMode_v11070
+#define PFN_cuMemGetHandleForAddressRange  PFN_cuMemGetHandleForAddressRange_v11070
+#define PFN_cuLibraryLoadData PFN_cuLibraryLoadData_v12000
+#define PFN_cuLibraryLoadFromFile PFN_cuLibraryLoadFromFile_v12000
+#define PFN_cuLibraryUnload PFN_cuLibraryUnload_v12000
+#define PFN_cuLibraryGetKernel PFN_cuLibraryGetKernel_v12000
+#define PFN_cuLibraryGetModule PFN_cuLibraryGetModule_v12000
+#define PFN_cuKernelGetFunction PFN_cuKernelGetFunction_v12000
+#define PFN_cuKernelGetLibrary PFN_cuKernelGetLibrary_v12050
+#define PFN_cuLibraryGetGlobal PFN_cuLibraryGetGlobal_v12000
+#define PFN_cuLibraryGetManaged PFN_cuLibraryGetManaged_v12000
+#define PFN_cuLibraryGetKernelCount PFN_cuLibraryGetKernelCount_v12040
+#define PFN_cuLibraryEnumerateKernels PFN_cuLibraryEnumerateKernels_v12040
+#define PFN_cuKernelGetAttribute PFN_cuKernelGetAttribute_v12000
+#define PFN_cuKernelSetAttribute PFN_cuKernelSetAttribute_v12000
+#define PFN_cuKernelSetCacheConfig PFN_cuKernelSetCacheConfig_v12000
+#define PFN_cuKernelGetName  PFN_cuKernelGetName_v12030
+#define PFN_cuKernelGetParamInfo  PFN_cuKernelGetParamInfo_v12040
+#define PFN_cuLibraryGetUnifiedFunction PFN_cuLibraryGetUnifiedFunction_v12000
+#define PFN_cuCoredumpGetAttribute PFN_cuCoredumpGetAttribute_v12010
+#define PFN_cuCoredumpGetAttributeGlobal PFN_cuCoredumpGetAttributeGlobal_v12010
+#define PFN_cuCoredumpSetAttribute PFN_cuCoredumpSetAttribute_v12010
+#define PFN_cuCoredumpSetAttributeGlobal PFN_cuCoredumpSetAttributeGlobal_v12010
+#define PFN_cuDeviceRegisterAsyncNotification PFN_cuDeviceRegisterAsyncNotification_v12040
+#define PFN_cuDeviceUnregisterAsyncNotification PFN_cuDeviceUnregisterAsyncNotification_v12040
+#define PFN_cuGreenCtxCreate PFN_cuGreenCtxCreate_v12040
+#define PFN_cuGreenCtxDestroy PFN_cuGreenCtxDestroy_v12040
+#define PFN_cuDeviceGetDevResource PFN_cuDeviceGetDevResource_v12040
+#define PFN_cuCtxGetDevResource PFN_cuCtxGetDevResource_v12040
+#define PFN_cuGreenCtxGetDevResource PFN_cuGreenCtxGetDevResource_v12040
+#define PFN_cuGreenCtxRecordEvent PFN_cuGreenCtxRecordEvent_v12040
+#define PFN_cuGreenCtxWaitEvent PFN_cuGreenCtxWaitEvent_v12040
+#define PFN_cuDevResourceGenerateDesc PFN_cuDevResourceGenerateDesc_v12040
+#define PFN_cuDevSmResourceSplitByCount PFN_cuDevSmResourceSplitByCount_v12040
+#define PFN_cuStreamGetGreenCtx PFN_cuStreamGetGreenCtx_v12040
+#define PFN_cuCtxFromGreenCtx PFN_cuCtxFromGreenCtx_v12040
+#define PFN_cuCtxRecordEvent PFN_cuCtxRecordEvent_v12050
+#define PFN_cuCtxWaitEvent PFN_cuCtxWaitEvent_v12050
+#define PFN_cuGreenCtxStreamCreate PFN_cuGreenCtxStreamCreate_v12050
+#define PFN_cuStreamGetCtx_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 12050, 12050) /* NOTE(review): the alias name itself ends in _v2 while the helper is given PFN_cuStreamGetCtx with 12050 for both version arguments - confirm this is intentional. */
+#define PFN_cuMemBatchDecompressAsync __API_TYPEDEF_PTSZ(PFN_cuMemBatchDecompressAsync, 12060, 12060) /* Both version arguments are 12060. */
+
+#define PFN_cuCheckpointProcessGetRestoreThreadId PFN_cuCheckpointProcessGetRestoreThreadId_v12080 /* The six cuCheckpointProcess* aliases in this group all point at a _v12080 name. */
+#define PFN_cuCheckpointProcessGetState PFN_cuCheckpointProcessGetState_v12080
+#define PFN_cuCheckpointProcessLock PFN_cuCheckpointProcessLock_v12080
+#define PFN_cuCheckpointProcessCheckpoint PFN_cuCheckpointProcessCheckpoint_v12080
+#define PFN_cuCheckpointProcessRestore PFN_cuCheckpointProcessRestore_v12080
+#define PFN_cuCheckpointProcessUnlock PFN_cuCheckpointProcessUnlock_v12080
+
+/*
+ * Type definitions for functions defined in cuda.h: each typedef below names a CUDAAPI function-pointer type returning CUresult, with a _vNNNN version tag (and sometimes a _ptds/_ptsz suffix) in its name.
+ */
+typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr); /* Writes a string pointer through pStr for the given CUresult. */
+typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr); /* Same parameter list as cuGetErrorString above. */
+typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal); /* cuDevice* typedefs: most signatures in this group spell the device handle as CUdevice_v1. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev); /* Same parameter list as the _v9020 typedef above; only the version tag differs. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags); /* The attribute list is passed as an untyped void pointer. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev); /* Uses the unsuffixed CUdevice type name, unlike the CUdevice_v1 typedefs above. */
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev); /* First of three cuCtxCreate variants; each has a different parameter list. */
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev); /* Adds paramsArray/numParams ahead of flags compared with _v3020. */
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v12050)(CUcontext *pctx, CUctxCreateParams *ctxCreateParams, unsigned int flags, CUdevice dev); /* Takes a single CUctxCreateParams pointer instead of the array/count pair, and the unsuffixed CUdevice type. */
+typedef CUresult (CUDAAPI *PFN_cuCtxGetId_v12000)(CUcontext ctx, unsigned long long *ctxId);
+typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetFlags_v12010)(unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void); /* Takes no arguments. */
+typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
+typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void); /* Takes no arguments. */
+typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname); /* cuModule* typedefs, followed by the cuLink* typedefs. */
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); /* Extends cuModuleLoadData's parameters with numOptions/options/optionValues. */
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
+typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetFunctionCount)(unsigned int *count, CUmodule hmod); /* NOTE(review): no _vNNNN suffix, unlike the surrounding typedefs; presumably relies on a same-named alias macro earlier in the file expanding to a versioned name - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuModuleEnumerateFunctions)(CUfunction *functions, unsigned int numFunctions, CUmodule mod); /* NOTE(review): also unversioned; same caveat as the line above. */
+typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
+typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total); /* Allocation, host-memory, PCI-bus-id and IPC typedefs; device addresses are spelled CUdeviceptr_v2 here. */
+typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags); /* cuMemAllocHost's parameters plus a Flags word. */
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags); /* Tagged v11000, whereas the other cuIpc* typedefs in this group are tagged v4010. */
+typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount); /* cuMemcpy* typedefs without a CUstream parameter carry a _ptds suffix (presumably per-thread default stream - TODO confirm). */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy); /* The copy is described entirely by the pCopy descriptor struct. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream); /* From here on the *Async typedefs take a trailing CUstream and carry a _ptsz suffix instead of _ptds. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyBatchAsync_v12080_ptsz)(CUdeviceptr_v2 *dsts, CUdeviceptr_v2 *srcs, size_t *sizes, size_t count, CUmemcpyAttributes_v1 *attrs, size_t *attrIdxs, size_t numAttrs, size_t *failIdx, CUstream hStream); /* Batch form: takes arrays (dsts/srcs/sizes) plus a count, an attribute table, and a failIdx out-pointer. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DBatchAsync_v12080_ptsz)(size_t numParams, CUDA_MEMCPY3D_BATCH_OP_v1 *opList, size_t *failIdx, unsigned long long flags, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N); /* cuMemset* typedefs: the D8/D16/D32 variants take an unsigned char/short/int fill value respectively. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); /* The D2D* variants replace N with dstPitch, Width and Height. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream); /* Async forms: same parameters as above plus a trailing CUstream, with a _ptsz suffix. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray); /* CUarray and CUmipmappedArray typedefs. */
+typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap); /* Mipmapped counterpart of the typedef above: same out-struct type, CUmipmappedArray handle. */
+typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
+typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels); /* Takes the same CUDA_ARRAY3D_DESCRIPTOR_v2 descriptor type as cuArray3DCreate. */
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
+typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags); /* Address-range, CUmemGenericAllocationHandle_v1 and mapping typedefs, mostly tagged v10020. */
+typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream); /* The only typedef in this group with a CUstream parameter and a _ptsz suffix. */
+typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags); /* The shareable handle is passed as void*; its concrete type presumably depends on handleType - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream); /* Stream-taking alloc/free and CUmemoryPool typedefs, all tagged v11020. */
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value); /* The attribute value travels through an untyped void pointer. */
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); /* cuMemAllocAsync's parameters with an explicit CUmemoryPool inserted before the stream. */
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr); /* Pointer-attribute, prefetch/advise, range-attribute and cuMulticast* typedefs. */
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v12020)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUmemLocation_v1 location); /* Differs from _v8000 only in the last parameter: a CUmemLocation_v1 instead of a CUdevice_v1. */
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v12020_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUmemLocation_v1 location, unsigned int flags, CUstream hStream); /* Compared with _v8000_ptsz: CUmemLocation_v1 replaces the CUdevice_v1 and a flags word is added. */
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count); /* Plural form: arrays of data pointers, sizes and attributes, with numAttributes giving the count. */
+typedef CUresult (CUDAAPI *PFN_cuMulticastCreate_v12010)(CUmemGenericAllocationHandle *mcHandle, const CUmulticastObjectProp *prop); /* The cuMulticast* typedefs use unsuffixed type names (CUmemGenericAllocationHandle, CUdevice, CUdeviceptr). */
+typedef CUresult (CUDAAPI *PFN_cuMulticastAddDevice_v12010)(CUmemGenericAllocationHandle mcHandle, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuMulticastBindMem_v12010)(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUmemGenericAllocationHandle memHandle, size_t memOffset, size_t size, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMulticastBindAddr_v12010)(CUmemGenericAllocationHandle mcHandle, size_t mcOffset, CUdeviceptr memptr, size_t size, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMulticastUnbind_v12010)(CUmemGenericAllocationHandle mcHandle, CUdevice dev, size_t mcOffset, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMulticastGetGranularity_v12010)(size_t *granularity, const CUmulticastObjectProp *prop, CUmulticastGranularity_flags option);
+typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags); /* cuStream* typedefs; most that take an existing CUstream carry a _ptsz suffix. */
+typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetId_v12000)(CUstream hStream, unsigned long long *streamId);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetId_v12000_ptsz)(CUstream hStream, unsigned long long *streamId); /* Same parameter list as the non-_ptsz typedef above; both names are declared. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCaptureToGraph_v12030_ptsz)(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode); /* Takes no stream argument and has no _ptsz suffix. */
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); /* First of three cuStreamGetCaptureInfo variants; later ones append more out-parameters. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); /* Adds graph_out, dependencies_out and numDependencies_out. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v12030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out); /* Adds edgeData_out between dependencies_out and numDependencies_out. */
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v12030_ptsz)(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags); /* Adds dependencyData relative to the _v11030 form. */
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream); /* No _ptsz suffix on this one, although it takes a CUstream. */
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags); /* cuEvent* typedefs; only the two record variants take a CUstream and carry _ptsz. */
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags); /* cuEventRecord's parameters plus a flags word. */
+typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v12080)(float *pMilliseconds, CUevent hStart, CUevent hEnd); /* Same parameter list as the _v2000 typedef above; only the version tag differs. */
+typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc); /* CUexternalMemory / CUexternalSemaphore typedefs, all tagged v10000. */
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
+typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream); /* Takes two arrays (extSemArray, paramsArray) and one count, numExtSems. */
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream); /* Same shape as the signal typedef above, with the WAIT_PARAMS struct type. */
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); /* Stream wait/write-value typedefs: an older set (v8000/v9000) followed by a v11070 set. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); /* The four v11070 wait/write typedefs repeat the parameter lists of their v8000/v9000 counterparts. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); /* Uses the unsuffixed CUstreamBatchMemOpParams type where the _v8000 form uses CUstreamBatchMemOpParams_v1. */
+typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc); /* cuFunc* attribute/query group: pointer-to-function typedefs operating on a CUfunction handle. The getter writes through the int out-pointer pi. */
+typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value); /* Setter puts the handle first, whereas the getter above puts it last. */
+typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetName_v12030)(const char **name, CUfunction hfunc); /* Returns the name through a const char ** out-parameter. */
+typedef CUresult (CUDAAPI *PFN_cuFuncGetParamInfo_v12040)(CUfunction func, size_t paramIndex, size_t *paramOffset, size_t *paramSize); /* One index in, two size_t out-pointers (offset and size). */
+typedef CUresult (CUDAAPI *PFN_cuFuncIsLoaded_v12040)(CUfunctionLoadingState *state, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncLoad_v12040)(CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); /* Launch group. Signature carries three grid dims, three block dims, a shared-memory byte count, a stream, and two void ** argument arrays. NOTE(review): _ptsz presumably marks the per-thread default-stream variant - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra); /* Launch geometry is passed via a const CUlaunchConfig pointer instead of individual dimension arguments. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); /* Same parameters as the cuLaunchKernel typedef minus the trailing extra argument. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags); /* No _ptsz suffix and no CUstream parameter in the signature itself. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z); /* v2000 group: cuFuncSet*, cuParamSet* and cuLaunch* typedefs that configure a CUfunction piecewise. NOTE(review): these appear to be the legacy/deprecated launch API - confirm against the driver API docs. */
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value); /* The three cuParamSet{i,f,v} typedefs share (hfunc, offset, ...) and differ only in the value type. */
+typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
+typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f); /* Takes only the function handle; no dimensions or stream in the signature. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
+typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream); /* Same as cuLaunchGrid plus a trailing CUstream. */
+typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags); /* Graph creation and per-node-type group. The cuGraphAdd*Node typedefs below share the prefix (phGraphNode, hGraph, dependencies, numDependencies) followed by node-specific arguments. */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); /* Getter takes a mutable params pointer; the matching setter takes a const one. */
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v12000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams); /* v12000 kernel-node typedefs differ from v10000 only in using CUDA_KERNEL_NODE_PARAMS_v2. */
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v12000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v12000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx); /* Memcpy and memset add-node typedefs take an extra trailing CUcontext. */
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); /* Node payload is a CUgraph handle passed by value rather than a params struct. */
+typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); /* Only the shared four-argument prefix; no node-specific argument. */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); /* Event record/wait node typedefs carry a CUevent by value; getters return it via CUevent *event_out. */
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams); /* External-semaphore signal/wait node typedefs; getters name their out-parameter params_out. */
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); /* Batch mem-op node typedefs use the unversioned CUDA_BATCH_MEM_OP_NODE_PARAMS struct name. */
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); /* Exec-level setter: takes a CUgraphExec in addition to the node. Parameter names here are graphExec/node rather than the hGraphExec/hNode used by the other exec setters in this file. */
+typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph); /* Graph clone / topology-query / dependency-edit group. */
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); /* Query typedefs take an output array plus a size_t * count. NOTE(review): the count is presumably in/out (capacity in, size out) - confirm against the driver API docs. */
+typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v12030)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, CUgraphEdgeData *edgeData, size_t *numEdges); /* Each v12030 typedef in this group inserts a CUgraphEdgeData pointer before the count, relative to its v10000 twin. */
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v12030)(CUgraphNode hNode, CUgraphNode *dependencies, CUgraphEdgeData *edgeData, size_t *numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v12030)(CUgraphNode hNode, CUgraphNode *dependentNodes, CUgraphEdgeData *edgeData, size_t *numDependentNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); /* Add/remove typedefs take const from/to arrays and a by-value count. */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v12030)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v12030)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, const CUgraphEdgeData *edgeData, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); /* Graph instantiate / executable-graph group. Flags are typed unsigned long long here, while cuGraphExecGetFlags below uses cuuint64_t. */
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithParams_v12000_ptsz)(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams); /* Params struct is passed by mutable pointer. NOTE(review): _ptsz presumably marks the per-thread default-stream variant - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuGraphExecGetFlags_v12000)(CUgraphExec hGraphExec, cuuint64_t *flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); /* cuGraphExec*SetParams typedefs share the prefix (hGraphExec, hNode) followed by the node payload. */
+typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v12000)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v2 *nodeParams); /* Differs from the v10010 typedef only in using CUDA_KERNEL_NODE_PARAMS_v2. */
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream); /* Upload and launch typedefs have the same (hGraphExec, hStream) signature. */
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); /* v10020 reports through two separate out-pointers (error node and result enum). */
+typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v12000)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphExecUpdateResultInfo *resultInfo); /* v12000 replaces the two out-pointers with a single CUgraphExecUpdateResultInfo pointer. */
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src); /* Graph kernel-node attributes, debug print, memory nodes, generic node API and device graph-memory group. Note the (dst, src) argument order. */
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags); /* Takes a C-string path. NOTE(review): presumably the output file for a DOT dump - confirm against the driver API docs. */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); /* nodeParams is a non-const pointer here, unlike most other cuGraphAdd*Node typedefs in this file. */
+typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); /* Uses the unversioned CUdeviceptr, whereas most typedefs in this file use CUdeviceptr_v2. NOTE(review): presumably the same type - confirm in cuda.h. */
+typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled); /* The enabled flag is an unsigned int by value in the setter and an unsigned int * in the getter. */
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddNode_v12020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraphNodeParams *nodeParams); /* Generic add-node typedef taking a CUgraphNodeParams pointer instead of a node-type-specific struct. */
+typedef CUresult (CUDAAPI *PFN_cuGraphAddNode_v12030)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUgraphNodeParams *nodeParams); /* v12030 inserts a const CUgraphEdgeData *dependencyData before the count. */
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetParams_v12020)(CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecNodeSetParams_v12020)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraphNodeParams *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphConditionalHandleCreate_v12030)(CUgraphConditionalHandle *pHandle_out, CUgraph hGraph, CUcontext ctx, unsigned int defaultLaunchValue, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device); /* The three cuDevice*GraphMem* typedefs take a CUdevice rather than a graph handle. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value); /* Attribute value is passed as an untyped void *; get and set have identical signatures. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); /* Occupancy group: each typedef writes its result through leading out-pointer(s) and takes the CUfunction being queried. */
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); /* Same as the typedef above plus a trailing flags argument. */
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); /* Two int out-pointers; also accepts a CUoccupancyB2DSize argument (a type declared outside this view). */
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config); /* The two cluster typedefs take a const CUlaunchConfig pointer instead of explicit block-size arguments. */
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags); /* Texture-reference / surface-reference group. Most setters take the CUtexref first; getters take out-pointer(s) first and the CUtexref after. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes); /* Exception among the setters: leads with a size_t *ByteOffset out-pointer. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso); /* Setter takes unsigned int, but the matching getter below writes through int *. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor); /* Border color is passed as a non-const float pointer. NOTE(review): presumably a 4-element RGBA array - confirm against the driver API docs. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); /* Two out-pointers: format and channel count. */
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); /* Surface-reference pair: same shapes as cuTexRefSetArray/cuTexRefGetArray but on a CUsurfref. */
+typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc); /* Texture-object / surface-object group. Create takes an out-pointer for the object plus three const descriptor pointers. */
+typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject); /* Each getter returns one of the three descriptors through a mutable pointer. */
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc); /* Surface objects are created from a resource descriptor only (no texture or view descriptor). */
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeTiled_v12000)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const cuuint32_t *boxDim, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); /* Tensor-map group: each encode typedef fills a CUtensorMap through its first argument. NOTE(review): the array arguments are presumably tensorRank elements long - confirm against the driver API docs. */
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeIm2col_v12000)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, const int *pixelBoxLowerCorner, const int *pixelBoxUpperCorner, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); /* Replaces boxDim with pixel-box corner arrays and per-pixel/per-column counts. */
+typedef CUresult (CUDAAPI *PFN_cuTensorMapReplaceAddress_v12000)(CUtensorMap *tensorMap, void *globalAddress);
+typedef CUresult (CUDAAPI *PFN_cuTensorMapEncodeIm2colWide_v12080)(CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType, cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim, const cuuint64_t *globalStrides, int pixelBoxLowerCornerWidth, int pixelBoxUpperCornerWidth, cuuint32_t channelsPerPixel, cuuint32_t pixelsPerColumn, const cuuint32_t *elementStrides, CUtensorMapInterleave interleave, CUtensorMapIm2ColWideMode mode, CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion, CUtensorMapFloatOOBfill oobFill); /* Versus Im2col: the corner arguments are scalar ints rather than const int arrays, and a CUtensorMapIm2ColWideMode argument is added after interleave. */
+typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev); /* Peer-access group. The query typedefs take two CUdevice_v1 values and write an int result through the first argument. */
+typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags); /* Enable/disable operate on a CUcontext rather than a device. */
+typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource); /* Graphics-interop group: typedefs operating on CUgraphicsResource handles. */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource); /* Two out-pointers: device pointer and size. */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream); /* Map and unmap share the shape (count, resources array, stream). NOTE(review): _ptsz presumably marks the per-thread default-stream variant - confirm. */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId); /* Export-table / module / entry-point lookup group. Returns an opaque const void * table selected by a CUuuid. */
+typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); /* Looks up by symbol string and returns the address through void **pfn. NOTE(review): callers presumably cast pfn to the matching PFN_* typedef from this file - confirm intended usage. */
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v12000)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags, CUdriverProcAddressQueryResult *symbolFound); /* v12000 appends a CUdriverProcAddressQueryResult out-pointer to the v11030 parameter list. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount); /* Memcpy / memset group. None of the names in this run carry a _ptds or _ptsz suffix. Destination comes first, then source, then ByteCount. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount); /* CUarray endpoints are addressed as (array, offset) pairs. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); /* In this run every *Async typedef ends with a trailing CUstream hStream. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy); /* 2D/3D copies take a single const descriptor pointer instead of explicit endpoints. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N); /* Memset typedefs: the D8/D16/D32 name suffix matches the fill-value type (unsigned char / unsigned short / unsigned int). */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); /* 2D memset typedefs add dstPitch and replace N with Width and Height. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount); /* Both endpoints are CUdeviceptr_v2; no direction is encoded in the name or signature. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount); /* Peer copies pair each device pointer with its CUcontext. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyBatchAsync_v12080)(CUdeviceptr_v2 *dsts, CUdeviceptr_v2 *srcs, size_t *sizes, size_t count, CUmemcpyAttributes_v1 *attrs, size_t *attrIdxs, size_t numAttrs, size_t *failIdx, CUstream hStream); /* Batch copy: parallel dsts/srcs/sizes arrays with count, an attributes array with numAttrs, and a size_t *failIdx out-pointer. All array pointers are non-const. */
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DBatchAsync_v12080)(size_t numParams, CUDA_MEMCPY3D_BATCH_OP_v1 *opList, size_t *failIdx, unsigned long long flags, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream); /* Async memset typedefs mirror the synchronous ones above plus a trailing CUstream. */
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority); /* Stream / event / launch / mem-op group. Apart from PFN_cuStreamGetDevice_v12080_ptsz, the names in this run carry no _ptsz suffix. Stream getters take the CUstream first and an out-pointer second. */
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetDevice_v12080)(CUstream hStream, CUdevice *device);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetDevice_v12080_ptsz)(CUstream hStream, CUdevice *device); /* Identical signature to the unsuffixed typedef on the previous line; only the name differs. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags); /* cuEventRecord signature plus a trailing flags argument. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); /* Same parameter list as PFN_cuLaunchKernel_v7000_ptsz declared earlier in this file; only the name and version tag differ. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); /* Wait/write-value typedefs share the shape (stream, addr, value, flags); only the value width differs. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); /* Uses the unversioned CUstreamBatchMemOpParams, whereas PFN_cuStreamBatchMemOp_v8000_ptsz earlier in this file uses CUstreamBatchMemOpParams_v1. NOTE(review): presumably the same type via typedef - confirm in cuda.h. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); /* The v11070 typedefs repeat the parameter lists of the v8000/v9000 ones above. */
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream); /* v8000 names the prefetch destination with a CUdevice_v1. */
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v12020)(CUdeviceptr_v2 devPtr, size_t count, CUmemLocation_v1 location, unsigned int flags, CUstream hStream); /* v12020 replaces the CUdevice_v1 with a by-value CUmemLocation_v1 and adds a flags argument. */
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); /* Same parameter list as PFN_cuLaunchCooperativeKernel_v9000_ptsz declared earlier in this file. */
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCaptureToGraph_v12030)(CUstream hStream, CUgraph hGraph, const CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v12030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, const CUgraphEdgeData **edgeData_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v12030)(CUstream hStream, CUgraphNode *dependencies, const CUgraphEdgeData *dependencyData, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithParams_v12000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode);
+typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuLibraryLoadData_v12000)(CUlibrary *library, const void *code, CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, CUlibraryOption *libraryOptions, void** libraryOptionValues, unsigned int numLibraryOptions);
+typedef CUresult (CUDAAPI *PFN_cuLibraryLoadFromFile_v12000)(CUlibrary *library, const char *fileName, CUjit_option *jitOptions, void **jitOptionsValues, unsigned int numJitOptions, CUlibraryOption *libraryOptions, void **libraryOptionValues, unsigned int numLibraryOptions);
+typedef CUresult (CUDAAPI *PFN_cuLibraryUnload_v12000)(CUlibrary library);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetKernel_v12000)(CUkernel *pKernel, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetModule_v12000)(CUmodule *pMod, CUlibrary library);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetKernelCount)(unsigned int *count, CUlibrary lib);
+typedef CUresult (CUDAAPI *PFN_cuLibraryEnumerateKernels)(CUkernel *kernels, unsigned int numKernels, CUlibrary lib);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetFunction_v12000)(CUfunction *pFunc, CUkernel kernel);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetLibrary_v12050)(CUlibrary *pLib, CUkernel kernel);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetGlobal_v12000)(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetManaged_v12000)(CUdeviceptr *dptr, size_t *bytes, CUlibrary library, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetAttribute_v12000)(int *pi, CUfunction_attribute attrib, CUkernel kernel, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelSetAttribute_v12000)(CUfunction_attribute attrib, int val, CUkernel kernel, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelSetCacheConfig_v12000)(CUkernel kernel, CUfunc_cache config, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetName_v12030)(const char **name, CUkernel hfunc);
+typedef CUresult (CUDAAPI *PFN_cuKernelGetParamInfo_v12040)(CUkernel kernel, size_t paramIndex, size_t *paramOffset, size_t *paramSize);
+typedef CUresult (CUDAAPI *PFN_cuLibraryGetUnifiedFunction_v12000)(void **fptr, CUlibrary library, const char *symbol);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpGetAttribute_v12010)(CUcoredumpSettings get, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpGetAttributeGlobal_v12010)(CUcoredumpSettings get, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpSetAttribute_v12010)(CUcoredumpSettings set, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuCoredumpSetAttributeGlobal_v12010)(CUcoredumpSettings set, void *value, size_t *size);
+typedef CUresult(CUDAAPI *PFN_cuDeviceRegisterAsyncNotification_v12040)(CUdevice device, CUasyncCallback callbackFunc, void *userData, CUasyncCallbackHandle *callback);
+typedef CUresult(CUDAAPI *PFN_cuDeviceUnregisterAsyncNotification_v12040)(CUdevice device, CUasyncCallbackHandle callback);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxCreate_v12040)(CUgreenCtx* phCtx, CUdevResourceDesc desc, CUdevice dev, unsigned int flags);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxDestroy_v12040)(CUgreenCtx hCtx);
+typedef CUresult(CUDAAPI *PFN_cuDeviceGetDevResource_v12040)(CUdevice dev, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuCtxGetDevResource_v12040)(CUcontext hCtx, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxGetDevResource_v12040)(CUgreenCtx hCtx, CUdevResource* result, CUdevResourceType type);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxRecordEvent_v12040)(CUgreenCtx hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuGreenCtxWaitEvent_v12040)(CUgreenCtx hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuDevResourceGenerateDesc_v12040)(CUdevResourceDesc* phDesc, CUdevResource* resources, unsigned int nbResources);
+typedef CUresult(CUDAAPI *PFN_cuDevSmResourceSplitByCount_v12040)(CUdevResource* result, unsigned int* nbGroups, const CUdevResource* input, CUdevResource* remaining, unsigned int useFlags, unsigned int minCount);
+typedef CUresult(CUDAAPI *PFN_cuStreamGetGreenCtx_v12040)(CUstream hStream, CUgreenCtx *phCtx);
+typedef CUresult(CUDAAPI *PFN_cuCtxFromGreenCtx_v12040)(CUcontext *pContext, CUgreenCtx hCtx);
+typedef CUresult(CUDAAPI *PFN_cuCtxRecordEvent_v12050)(CUcontext hCtx, CUevent hEvent);
+typedef CUresult(CUDAAPI *PFN_cuCtxWaitEvent_v12050)(CUcontext hCtx, CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuGreenCtxStreamCreate_v12050)(CUstream* phStream, CUgreenCtx greenCtx, unsigned int flags, int priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v12050)(CUstream hStream, CUcontext *pctx, CUgreenCtx *pGreenCtx);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v12050_ptsz)(CUstream hStream, CUcontext *pctx, CUgreenCtx *pGreenCtx);
+    typedef CUresult (CUDAAPI *PFN_cuMemBatchDecompressAsync_v12060)(CUmemDecompressParams *paramsArray, size_t count, unsigned int flags, size_t *errorIndex, CUstream stream);
+    typedef CUresult (CUDAAPI *PFN_cuMemBatchDecompressAsync_v12060_ptsz)(CUmemDecompressParams *paramsArray, size_t count, unsigned int flags, size_t *errorIndex, CUstream stream);
+/*
+ * Type definitions for older versioned functions in cuda.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
+    typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+    typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+    typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+#endif
+
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessGetRestoreThreadId_v12080)(int pid, int *tid);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessGetState_v12080)(int pid, CUprocessState *state);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessLock_v12080)(int pid, CUcheckpointLockArgs *args);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessCheckpoint_v12080)(int pid, CUcheckpointCheckpointArgs *args);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessRestore_v12080)(int pid, CUcheckpointRestoreArgs *args);
+    typedef CUresult (CUDAAPI *PFN_cuCheckpointProcessUnlock_v12080)(int pid, CUcheckpointUnlockArgs *args);
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAU.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAU.h
new file mode 100644
index 0000000000000000000000000000000000000000..97de57ae494d62ae176fc02ad3c0c3f4d43e1526
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAU.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2010-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAVDPAU_H
+#define CUDAVDPAU_H
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \defgroup CUDA_VDPAU VDPAU Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the VDPAU interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the CUDA device associated with a VDPAU device
+ *
+ * Returns in \p *pDevice the CUDA device associated with a \p vdpDevice, if
+ * applicable.
+ *
+ * \param pDevice           - Device associated with vdpDevice
+ * \param vdpDevice         - A VdpDevice handle
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaVDPAUGetDevice
+ */
+CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Create a CUDA context for interoperability with VDPAU
+ *
+ * Creates a new CUDA context, initializes VDPAU interoperability, and
+ * associates the CUDA context with the calling thread. It must be called
+ * before performing any other VDPAU interoperability operations. It may fail
+ * if the needed VDPAU driver facilities are not available. For usage of the
+ * \p flags parameter, see ::cuCtxCreate().
+ *
+ * \param pCtx              - Returned CUDA context
+ * \param flags             - Options for CUDA context creation
+ * \param device            - Device on which to create the context
+ * \param vdpDevice         - The VdpDevice to interop with
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice
+ */
+CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Registers a VDPAU VdpVideoSurface object
+ *
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The VdpVideoSurface is presented as an array of subresources that may be
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
+ *
+ * \htmlonly
+ * <table>
+ * <tr><th>VdpChromaType                               </th><th>arrayIndex</th><th>Size     </th><th>Format</th><th>Content            </th></tr>
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ * <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ * <tr>                                                     <td>2         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ * <tr>                                                     <td>3         </td><td>w/2 x h/4</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0         </td><td>w   x h/2</td><td>R8    </td><td>Top-field luma     </td></tr>
+ * <tr>                                                     <td>1         </td><td>w   x h/2</td><td>R8    </td><td>Bottom-field luma  </td></tr>
+ * <tr>                                                     <td>2         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Top-field chroma   </td></tr>
+ * <tr>                                                     <td>3         </td><td>w/2 x h/2</td><td>R8G8  </td><td>Bottom-field chroma</td></tr>
+ * </table>
+ * \endhtmlonly
+ *
+ * \latexonly
+ * \begin{tabular}{|l|l|l|l|l|}
+ * \hline
+ * VdpChromaType          & arrayIndex & Size      & Format & Content             \\
+ * \hline
+ * VDP\_CHROMA\_TYPE\_420 & 0          & w x h/2   & R8     & Top-field luma      \\
+ *                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+ *                        & 2          & w/2 x h/4 & R8G8   & Top-field chroma    \\
+ *                        & 3          & w/2 x h/4 & R8G8   & Bottom-field chroma \\
+ * \hline
+ * VDP\_CHROMA\_TYPE\_422 & 0          & w x h/2   & R8     & Top-field luma      \\
+ *                        & 1          & w x h/2   & R8     & Bottom-field luma   \\
+ *                        & 2          & w/2 x h/2 & R8G8   & Top-field chroma    \\
+ *                        & 3          & w/2 x h/2 & R8G8   & Bottom-field chroma \\
+ * \hline
+ * \end{tabular}
+ * \endlatexonly
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param vdpSurface    - The VdpVideoSurface to be registered
+ * \param flags         - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice,
+ * ::cudaGraphicsVDPAURegisterVideoSurface
+ */
+CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
+
+/**
+ * \brief Registers a VDPAU VdpOutputSurface object
+ *
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The VdpOutputSurface is presented as an array of subresources that may be
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
+ *
+ * \htmlonly
+ * <table>
+ * <tr><th>VdpRGBAFormat              </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content       </th></tr>
+ * <tr><td>VDP_RGBA_FORMAT_B8G8R8A8   </td><td>0         </td><td>w x h</td><td>ARGB8  </td><td>Entire surface</td></tr>
+ * <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0         </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
+ * </table>
+ * \endhtmlonly
+ *
+ * \latexonly
+ * \begin{tabular}{|l|l|l|l|l|}
+ * \hline
+ * VdpRGBAFormat                  & arrayIndex & Size  & Format  & Content        \\
+ * \hline
+ * VDP\_RGBA\_FORMAT\_B8G8R8A8    & 0          & w x h & ARGB8   & Entire surface \\
+ * VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0          & w x h & A2BGR10 & Entire surface \\
+ * \hline
+ * \end{tabular}
+ * \endlatexonly
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param vdpSurface    - The VdpOutputSurface to be registered
+ * \param flags         - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
+ * ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuVDPAUGetDevice,
+ * ::cudaGraphicsVDPAURegisterOutputSurface
+ */
+CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/** @} */ /* END CUDA_VDPAU */
+
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuVDPAUCtxCreate
+
+    CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+} /* extern "C" -- no trailing ';': a linkage-specification is not terminated by a semicolon */
+#endif /* __cplusplus */
+
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAUTypedefs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAUTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bfd148632827d222548be49b3a2ffb7caa1c4dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaVDPAUTypedefs.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAVDPAUTYPEDEFS_H
+#define CUDAVDPAUTYPEDEFS_H
+
+// Dependent includes for cudaVDPAU.h
+#include <vdpau/vdpau.h>
+
+#include <cudaVDPAU.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/* Unversioned aliases: each PFN_<name> macro expands to the newest versioned
+ * function-pointer typedef (declared below) for that driver function in cudaVDPAU.h
+ */
+#define PFN_cuVDPAUGetDevice  PFN_cuVDPAUGetDevice_v3010
+#define PFN_cuVDPAUCtxCreate  PFN_cuVDPAUCtxCreate_v3020
+#define PFN_cuGraphicsVDPAURegisterVideoSurface  PFN_cuGraphicsVDPAURegisterVideoSurface_v3010
+#define PFN_cuGraphicsVDPAURegisterOutputSurface  PFN_cuGraphicsVDPAURegisterOutputSurface_v3010
+
+
+/**
+ * Function-pointer types, one per driver function declared in cudaVDPAU.h; the _vNNNN suffix distinguishes versions of the same function.
+ */
+typedef CUresult (CUDAAPI *PFN_cuVDPAUGetDevice_v3010)(CUdevice_v1 *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3020)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterVideoSurface_v3010)(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsVDPAURegisterOutputSurface_v3010)(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/* Older versioned typedefs, compiled only when __CUDA_API_VERSION_INTERNAL is defined.
+ * NOTE(review): the _v3010 parameter list below is identical to PFN_cuVDPAUCtxCreate_v3020 - confirm that is intended.
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+typedef CUresult (CUDAAPI *PFN_cuVDPAUCtxCreate_v3010)(CUcontext *pCtx, unsigned int flags, CUdevice_v1 device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier.h
new file mode 100644
index 0000000000000000000000000000000000000000..12fd878dd10d9f18ad944a0d62ae1caba123fd06
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier.h
@@ -0,0 +1,280 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_H_
+# define _CUDA_AWBARRIER_H_
+
+# include "cuda_awbarrier_primitives.h"
+
+# if !defined(_CUDA_AWBARRIER_SM_TARGET)
+#  error This file requires compute capability 7.0 or greater.
+# endif
+
+# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
+#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+             -std=c++11 compiler option.
+# endif
+
+_CUDA_AWBARRIER_BEGIN_NAMESPACE
+
+class awbarrier {  // Arrive/wait barrier wrapping a single 64-bit state word; copy construction and copy assignment are deleted.
+public:
+    class arrival_token {  // Result of arrive()/arrive_and_drop(), later passed to the token-based wait functions.
+    public:
+        arrival_token() = default;
+        ~arrival_token() = default;
+        _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;  // pending count decoded from the stored token
+    private:
+        _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);  // private: only the friend class awbarrier builds tokens from raw values
+        uint64_t token;  // raw 64-bit token as returned by the internal arrive helpers
+        friend awbarrier;
+    };
+    awbarrier() = default;
+    awbarrier(const awbarrier&) = delete;
+    awbarrier& operator=(const awbarrier&) = delete;
+    ~awbarrier() = default;
+    // Operations on the barrier word; all are defined inline further down in this header.
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();  // arrive and obtain the token for this arrival
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();  // as arrive(), but through the helpers' drop variant
+    _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);  // poll for up to about hint_cycles clock64() ticks; true on completion
+    _CUDA_AWBARRIER_QUALIFIER bool timed_wait_parity(bool phase, uint32_t hint_cycles);  // same polling loop, testing a phase parity instead of a token
+    _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);  // repeats timed_wait() until it returns true
+    _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();  // wait(arrive())
+    _CUDA_AWBARRIER_QUALIFIER bool try_wait(arrival_token token, uint32_t maxSleepNanosec);  // one call into the internal awbarrier_try_wait helper
+    _CUDA_AWBARRIER_QUALIFIER bool try_wait_parity(bool phase, uint32_t maxSleepNanosec);  // one call into the internal awbarrier_try_wait_parity helper
+    _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();  // returns _CUDA_AWBARRIER_MAX_COUNT
+
+private:
+    uint64_t barrier;  // barrier state word; its address is what the internal helpers operate on
+    friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
+    friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
+    friend class pipeline;
+};
+
+_CUDA_AWBARRIER_QUALIFIER  // Returns the pending count carried by this token.
+uint32_t awbarrier::arrival_token::pending_count() const
+{
+#if (__CUDA_ARCH__ >= 900)
+    constexpr uint32_t count_shift = 0;   // the helper's value is returned unchanged
+#else
+    constexpr uint32_t count_shift = 15;  // otherwise the helper's value is shifted right by 15 bits
+#endif
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token) >> count_shift;
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Private constructor: keeps the raw 64-bit token value.
+awbarrier::arrival_token::arrival_token(uint64_t token)
+{
+    this->token = token;
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Initializes *barrier with the given expected arrival count.
+void init(awbarrier* barrier, uint32_t expected_count)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
+
+    uint32_t encoded_count = expected_count;
+#if !(__CUDA_ARCH__ >= 900)
+    // Outside the >= 900 build the count is passed twice over: once as is, plus once shifted left by 15.
+    encoded_count += expected_count << 15;
+#endif
+
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, encoded_count);
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Invalidates *barrier through the internal awbarrier_inval helper.
+void inval(awbarrier* barrier)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    uint64_t* const state_word = &barrier->barrier;  // raw 64-bit barrier word
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(state_word);
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Arrives on the barrier and returns the token for this arrival.
+awbarrier::arrival_token awbarrier::arrive()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+    // __CUDA_ARCH__ < 900: two arrivals. The no-complete arrival of 1 << 15 supplies the token; the plain arrival's result is discarded by (void).
+ #if (__CUDA_ARCH__ < 900)
+    const uint32_t arrive_count = 1 << 15;
+    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
+    (void)
+#else
+    const uint64_t token =
+ #endif
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
+    // Either preprocessor branch completes the statement above: "(void) call;" or "const uint64_t token = call;".
+    return arrival_token(token);
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Same shape as arrive(), but instantiates the helpers with _Drop = true.
+awbarrier::arrival_token awbarrier::arrive_and_drop()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+    // __CUDA_ARCH__ < 900: two arrivals. The no-complete arrival of 1 << 15 supplies the token; the plain arrival's result is discarded by (void).
+ #if (__CUDA_ARCH__ < 900)
+    const uint32_t arrive_count = 1 << 15;
+    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
+    (void)
+#else
+    const uint64_t token =
+ #endif
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
+    // Either preprocessor branch completes the statement above: "(void) call;" or "const uint64_t token = call;".
+    return arrival_token(token);
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Polls for completion of token's phase for roughly hint_cycles clock cycles; returns true once it completes.
+bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
+{
+    constexpr uint64_t spin_only_cycles = 1024;   // no sleeping until more than this many cycles have elapsed
+    constexpr uint32_t backoff_cap_ns = 1 << 20;  // the sleep interval stops doubling once it reaches this value
+
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    // Fast path: one test before any clock is read.
+    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+        return true;
+    }
+
+    const uint64_t first_tick = clock64();
+    uint32_t backoff_ns = 32;
+
+    // The step expression refreshes the elapsed time after every pass, including passes ended by continue.
+    for (uint64_t waited = 0; waited < hint_cycles; waited = clock64() - first_tick) {
+        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+            return true;
+        }
+        if (waited <= spin_only_cycles) {
+            continue;  // still inside the busy-polling window
+        }
+
+        // Past the busy-polling window: sleep, then double the interval while it is below the cap.
+        __nanosleep(backoff_ns);
+        backoff_ns = (backoff_ns < backoff_cap_ns) ? (backoff_ns * 2) : backoff_ns;
+    }
+
+    return false;
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Polls the parity test for the given phase for roughly hint_cycles clock cycles; returns true once it succeeds.
+bool awbarrier::timed_wait_parity(bool phase, uint32_t hint_cycles)
+{
+    constexpr uint64_t spin_only_cycles = 1024;   // no sleeping until more than this many cycles have elapsed
+    constexpr uint32_t backoff_cap_ns = 1 << 20;  // the sleep interval stops doubling once it reaches this value
+
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    // Fast path: one test before any clock is read.
+    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(&this->barrier, phase)) {
+        return true;
+    }
+
+    const uint64_t first_tick = clock64();
+    uint32_t backoff_ns = 32;
+
+    // The step expression refreshes the elapsed time after every pass, including passes ended by continue.
+    for (uint64_t waited = 0; waited < hint_cycles; waited = clock64() - first_tick) {
+        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait_parity(&this->barrier, phase)) {
+            return true;
+        }
+        if (waited <= spin_only_cycles) {
+            continue;  // still inside the busy-polling window
+        }
+
+        // Past the busy-polling window: sleep, then double the interval while it is below the cap.
+        __nanosleep(backoff_ns);
+        backoff_ns = (backoff_ns < backoff_cap_ns) ? (backoff_ns * 2) : backoff_ns;
+    }
+
+    return false;
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // One call into the internal awbarrier_try_wait helper; its result is returned as is.
+bool awbarrier::try_wait(arrival_token token, uint32_t maxSleepNanosec)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+    const bool phase_done = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait(&this->barrier, token.token, maxSleepNanosec);
+    return phase_done;
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // One call into the internal awbarrier_try_wait_parity helper; its result is returned as is.
+bool awbarrier::try_wait_parity(bool phase, uint32_t maxSleepNanosec)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+    const bool phase_done = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_try_wait_parity(&this->barrier, phase, maxSleepNanosec);
+    return phase_done;
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Returns only after a timed wait on token has reported completion.
+void awbarrier::wait(arrival_token token)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+    // Keep issuing timed waits with the largest 32-bit cycle hint until one of them succeeds.
+    do {} while (!this->timed_wait(token, ~0u));
+}
+
+_CUDA_AWBARRIER_QUALIFIER  // Arrives, then waits on the token that the arrival produced.
+void awbarrier::arrive_and_wait()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+    const arrival_token own_arrival = this->arrive();
+    this->wait(own_arrival);
+}
+
+_CUDA_AWBARRIER_QUALIFIER __host__  // Compile-time constant: the value of _CUDA_AWBARRIER_MAX_COUNT.
+constexpr uint32_t awbarrier::max()
+{
+    return static_cast<uint32_t>(_CUDA_AWBARRIER_MAX_COUNT);
+}
+
+_CUDA_AWBARRIER_END_NAMESPACE
+
+#endif /* !_CUDA_AWBARRIER_H_ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_helpers.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c58346fe78c59329aca138ebc92add9015c005c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_awbarrier_helpers.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_HELPERS_H_
+#define _CUDA_AWBARRIER_HELPERS_H_
+
+#define _CUDA_AWBARRIER_NAMESPACE       nvcuda::experimental  // qualified name of the public namespace
+#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
+#define _CUDA_AWBARRIER_END_NAMESPACE   } }
+// The same three forms (qualified name, opener, closer) for the nested __awbarrier_internal namespace.
+#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE       _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
+#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
+#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE   } _CUDA_AWBARRIER_END_NAMESPACE
+
+# if !defined(_CUDA_AWBARRIER_QUALIFIER)  // overridable: only defined here when not already set
+#  define _CUDA_AWBARRIER_QUALIFIER inline __device__
+# endif
+# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)  // same override rule for the static variant
+#  define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
+#endif
+
+#if defined(__CUDA_ARCH__)  // __CUDA_ARCH__ is defined: pick the target tag from its value
+#if (__CUDA_ARCH__ >= 900)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_90
+#elif  (__CUDA_ARCH__ >= 800)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
+#elif (__CUDA_ARCH__ >= 700)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif  // no #else branch: for __CUDA_ARCH__ below 700 the target macro stays undefined
+#else  // __CUDA_ARCH__ is not defined: fall back to the SM_70 tag
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif
+// Count limit constant: (1 << 14) - 1 = 16383.
+#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
+// Flag defined when compiling as C++ with __cplusplus >= 201103L, or with _MSC_VER >= 1900.
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
+# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
+#endif
+
+#if !defined(_CUDA_AWBARRIER_DEBUG)  // default the debug switch from __CUDACC_DEBUG__ unless it is already set
+# if defined(__CUDACC_DEBUG__)
+#  define _CUDA_AWBARRIER_DEBUG 1
+# else
+#  define _CUDA_AWBARRIER_DEBUG 0
+# endif
+#endif
+// The checks below are live only when the debug switch equals 1 and NDEBUG is not defined.
+#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
+# if !defined(__CUDACC_RTC__)
+#  include <cassert>
+# endif
+# define _CUDA_AWBARRIER_ASSERT(x) assert((x));  // the expansion already ends in ';'
+# define _CUDA_AWBARRIER_ABORT() assert(0);
+#else
+# define _CUDA_AWBARRIER_ASSERT(x)  // expands to nothing
+# define _CUDA_AWBARRIER_ABORT() __trap();
+#endif
+
+#if defined(__CUDACC_RTC__)  // when __CUDACC_RTC__ is defined the fixed-width names are typedef'd here instead of including <stdint.h>
+typedef unsigned short     uint16_t;
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t           uintptr_t;
+#else
+# include <stdint.h>
+#endif
+
+// implicitly provided by NVRTC
+#ifndef __CUDACC_RTC__
+#include <nv/target>
+#endif /* !defined(__CUDACC_RTC__) */
+// Both the barrier type and the token type are plain 64-bit unsigned integers.
+typedef uint64_t __mbarrier_t;
+typedef uint64_t __mbarrier_token_t;
+
+_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
+
+extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);  // declared only; its 32-bit result is the "r" address operand of the mbarrier asm statements in this file
+
+union AWBarrier {  // Two overlapping views of one 64-bit barrier word; the split view is used by the NV_PROVIDES_SM_70 code paths.
+    struct {
+        uint32_t expected;  // first 32-bit member; seeded as 0x40000000 - expected_count by awbarrier_init
+        uint32_t pending;   // second 32-bit member; bit 31 is the bit awbarrier_test_wait compares against the token
+    } split;
+    uint64_t raw;  // the whole word
+};
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER  // Initializes the barrier word at *barrier for expected_count.
+void awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
+    // NV_PROVIDES_SM_80 body: one mbarrier.init instruction, addressed through __nvvm_get_smem_pointer, then return.
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
+                : "memory");
+        return;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+        // Software layout: each half starts expected_count below a fixed value (0x40000000 for expected, 0x80000000 for pending).
+        awbarrier->split.expected = 0x40000000 - expected_count;
+        awbarrier->split.pending = 0x80000000 - expected_count;
+        return;
+    )
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER  // Invalidates the barrier word; only the NV_PROVIDES_SM_80 body does any work.
+void awbarrier_inval(uint64_t* barrier) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    // When the block below is not taken, the function falls through to the bare return after it.
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        asm volatile ("mbarrier.inval.shared.b64 [%0];"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier))
+                : "memory");
+        return;
+    )
+    return;
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER  // Extracts the pending count encoded in token.
+uint32_t awbarrier_token_pending_count(uint64_t token) {
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint32_t __pending_count;
+        // The instruction reads the token from a 64-bit register and writes the count to a 32-bit one.
+        asm ("mbarrier.pending_count.b64 %0, %1;"
+                : "=r"(__pending_count)
+                : "l"(token));
+        return __pending_count;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        const uint32_t pending = token >> 32;  // software tokens carry the pending value in the upper 32 bits
+        return 0x80000000 - (pending & 0x7fffffff);  // distance of the low 31 bits from 0x80000000
+    )
+}
+
+template<bool _Drop>
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+uint64_t awbarrier_arrive_drop(uint64_t* barrier) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    // NV_PROVIDES_SM_80 body: a single mbarrier arrive instruction whose 64-bit output is returned as the token.
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint64_t token;
+        // _Drop is a template constant: true selects mbarrier.arrive_drop, false selects mbarrier.arrive.
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        }
+
+        return token;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+        // Spin for as long as the low 31 bits of pending read as zero.
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+        // Drop variant: add 1 to the expected half before arriving.
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, 1);
+        }
+
+        __threadfence_block();
+        // The arrival itself: add 1 to pending; "reset" is set when that add changed bit 31.
+        const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
+        const uint32_t new_pending = old_pending + 1;
+        const bool reset = (old_pending ^ new_pending) & 0x80000000;
+
+        if (reset) {
+            __threadfence_block();
+            // Read expected, clear bit 30, set bit 30 again if bit 29 is set, then add the result to pending.
+            uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
+            new_expected &= ~0x40000000;
+            if (new_expected & 0x20000000) {
+                new_expected |= 0x40000000;
+            }
+            atomicAdd_block(&awbarrier->split.pending, new_expected);
+        }
+        // Token: the pre-arrival pending value placed in the upper 32 bits.
+        return static_cast<uint64_t>(old_pending) << 32;
+    )
+}
+
+template<bool _Drop>
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
+    // NV_PROVIDES_SM_80 body: a single .noComplete arrive instruction taking count; its 64-bit output is the token.
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint64_t token;
+        // _Drop is a template constant: true selects the arrive_drop form, false the plain arrive form.
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        }
+
+        return token;
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+        // Spin for as long as the low 31 bits of pending read as zero.
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+        // Drop variant: add count to the expected half as well.
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, count);
+        }
+        // Add count to pending and return the previous pending value in the upper 32 bits; no reset handling on this path.
+        return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
+    )
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER  // Non-blocking test of the barrier against token; returns the test result as a bool.
+bool awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    // NV_PROVIDES_SM_80 body: mbarrier.test_wait sets a predicate, which selp turns into 1 or 0.
+    NV_IF_TARGET(NV_PROVIDES_SM_80,
+        uint32_t __wait_complete;
+
+        asm volatile ("{"
+                "    .reg .pred %%p;"
+                "    mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
+                "    selp.b32 %0, 1, 0, %%p;"
+                "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
+                : "memory");
+        return bool(__wait_complete);
+    )
+    NV_IF_TARGET(NV_PROVIDES_SM_70,
+        volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
+        // True when bit 31 of the current pending value differs from bit 31 of the token's upper half.
+        return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
+    )
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER  // Non-blocking test of the barrier against a phase parity bit.
+bool awbarrier_test_wait_parity(uint64_t* barrier, bool phase_parity) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    // Only an NV_PROVIDES_SM_90 body exists; phase_parity is passed to the instruction as a 32-bit 0 or 1.
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+        uint32_t __wait_complete = 0;
+
+        asm volatile ("{"
+                    ".reg .pred %%p;"
+                    "mbarrier.test_wait.parity.shared.b64 %%p, [%1], %2;"
+                    "selp.b32 %0, 1, 0, %%p;"
+                    "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(static_cast<uint32_t>(phase_parity))
+                : "memory");
+
+        return __wait_complete;
+    )
+    _CUDA_AWBARRIER_ABORT()  // reached only when the block above did not return
+    return false;
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER  // Single mbarrier.try_wait on token, with max_sleep_nanosec passed as the instruction's last operand.
+bool awbarrier_try_wait(uint64_t* barrier, uint64_t token, uint32_t max_sleep_nanosec) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    // Only an NV_PROVIDES_SM_90 body exists; the predicate result is converted to 1 or 0 by selp.
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+        uint32_t __wait_complete = 0;
+
+        asm volatile ("{\n\t"
+                    ".reg .pred p;\n\t"
+                    "mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n\t"
+                    "selp.b32 %0, 1, 0, p;\n\t"
+                    "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token), "r"(max_sleep_nanosec)
+                : "memory");
+
+        return __wait_complete;
+    )
+    _CUDA_AWBARRIER_ABORT()  // reached only when the block above did not return
+    return false;
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER  // Single mbarrier.try_wait.parity on a phase parity bit, with max_sleep_nanosec as the last operand.
+bool awbarrier_try_wait_parity(uint64_t* barrier, bool phase_parity, uint32_t max_sleep_nanosec) {
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    // Only an NV_PROVIDES_SM_90 body exists; phase_parity is passed to the instruction as a 32-bit 0 or 1.
+    NV_IF_TARGET(NV_PROVIDES_SM_90,
+        uint32_t __wait_complete = 0;
+
+        asm volatile ("{\n\t"
+                    ".reg .pred p;\n\t"
+                    "mbarrier.try_wait.parity.shared.b64 p, [%1], %2, %3;\n\t"
+                    "selp.b32 %0, 1, 0, p;\n\t"
+                    "}"
+                : "=r"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(static_cast<uint32_t>(phase_parity)), "r"(max_sleep_nanosec)
+                : "memory");
+
+        return __wait_complete;
+    )
+    _CUDA_AWBARRIER_ABORT()  // reached only when the block above did not return
+    return false;
+}
+
+_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
+
+#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_device_runtime_api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_device_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..07b7ec75b43325ea76b50f718822b1caf82cc9da
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_device_runtime_api.h
@@ -0,0 +1,914 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
+#define __CUDA_DEVICE_RUNTIME_API_H__
+
+#if defined(__CUDACC__) && !defined(__CUDACC_RTC__)
+#include <stdlib.h>
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(CUDA_FORCE_CDP1_IF_SUPPORTED) && !defined(__CUDADEVRT_INTERNAL__) && !defined(_NVHPC_CUDA) && !(defined(_WIN32) && !defined(_WIN64))
+#define __CUDA_INTERNAL_USE_CDP2
+#endif
+
+#if !defined(__CUDACC_RTC__)
+
+#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct cudaFuncAttributes;
+
+// Bug 4398304
+// WAR for Doxygen processing duplicate entries, which causes warnings to be listed in the documentation
+/** \cond impl_private */
+
+#ifndef __CUDA_INTERNAL_USE_CDP2
+inline __device__  cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s)
+{ /* Stub group (CDP1 names): each function here ignores its arguments and returns cudaErrorUnknown. */
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
+{
+  return cudaErrorUnknown; /* stub: *p is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+  return cudaErrorUnknown; /* stub: *value is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaGetDevice(int *device)
+{
+  return cudaErrorUnknown; /* stub: *device is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
+{
+  return cudaErrorUnknown; /* stub: *numBlocks is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
+{
+  return cudaErrorUnknown; /* stub: *numBlocks is not written */
+}
+#else // __CUDA_INTERNAL_USE_CDP2
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2Malloc(void **p, size_t s)
+{ /* Stub group (__cudaCDP2* names): each function here ignores its arguments and returns cudaErrorUnknown. */
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2FuncGetAttributes(struct cudaFuncAttributes *p, const void *c)
+{
+  return cudaErrorUnknown; /* stub: *p is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2DeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+  return cudaErrorUnknown; /* stub: *value is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2GetDevice(int *device)
+{
+  return cudaErrorUnknown; /* stub: *device is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
+{
+  return cudaErrorUnknown; /* stub: *numBlocks is not written */
+}
+
+inline __device__  cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
+{
+  return cudaErrorUnknown; /* stub: *numBlocks is not written */
+}
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+/** \endcond  */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) &&  !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
+
+#endif /* !defined(__CUDACC_RTC__) */
+
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) /* __DEPRECATED__(msg): deprecation marker; here it expands to nothing */
+# define __DEPRECATED__(msg)
+#elif defined(_WIN32) /* _WIN32 builds: __declspec form carrying the message */
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) /* GCC 3.x, or non-clang GCC 4.0-4.4: msg is dropped */
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else /* all remaining compilers: GNU attribute form carrying the message */
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING) /* __CUDA_ARCH__ is defined and the warning has not been suppressed */
+# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated. Moreover, such use will cause this module to fail to load on sm_90+ devices. If calls to "#func_name" from device code cannot be removed for older devices at this time, you may guard them with __CUDA_ARCH__ macros to remove them only for sm_90+ devices, making sure to generate code for compute_90 for the macros to take effect. Note that this mitigation will no longer work when support for "#func_name" from device code is eventually dropped for all devices. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.")
+#else /* otherwise the marker expands to nothing */
+# define __CDPRT_DEPRECATED(func_name)
+#endif
+
+#if defined(__cplusplus) && defined(__CUDACC__)         /* Visible to nvcc front-end only */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)   // Visible to SM>=3.5 and "__host__ __device__" only
+
+#include "driver_types.h"
+#include "crt/host_defines.h"
+
+#define cudaStreamGraphTailLaunch             ((cudaStream_t)0x0100000000000000) /**< Reserved stream handle value; by its name, selects tail launch for a device-side graph launch (confirm in the CUDA device graph launch docs). Parenthesized so the cast expands as one operand. */
+#define cudaStreamGraphFireAndForget          ((cudaStream_t)0x0200000000000000) /**< Reserved stream handle value; by its name, selects fire-and-forget for a device-side graph launch (confirm in the CUDA device graph launch docs). */
+#define cudaStreamGraphFireAndForgetAsSibling ((cudaStream_t)0x0300000000000000) /**< Reserved stream handle value; by its name, selects fire-and-forget-as-sibling for a device-side graph launch (confirm in the CUDA device graph launch docs). */
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+#define cudaStreamTailLaunch                ((cudaStream_t)0x3) /**< Per-grid stream with a tail launch semantics. Only applicable when used with CUDA Dynamic Parallelism. */
+#define cudaStreamFireAndForget             ((cudaStream_t)0x4) /**< Per-grid stream with a fire-and-forget synchronization behavior. Only applicable when used with CUDA Dynamic Parallelism. */
+#endif
+
+extern "C"
+{
+
+// Symbols beginning with __cudaCDP* should not be used outside
+// this header file. Instead, compile with -DCUDA_FORCE_CDP1_IF_SUPPORTED if
+// CDP1 support is required.
+
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void);
+
+// Bug 4398304
+// WAR for Doxygen processing duplicate entries, which causes warnings to be listed in the documentation
+/** \cond impl_private */
+
+#ifndef __CUDA_INTERNAL_USE_CDP2
+//// CDP1 endpoints
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+extern __DEPRECATED__("cudaDeviceGetSharedMemConfig deprecated") __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+#if (__CUDA_ARCH__ < 900) && (defined(CUDA_FORCE_CDP1_IF_SUPPORTED) || (defined(_WIN32) && !defined(_WIN64)))
+// cudaDeviceSynchronize is removed on sm_90+
+extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
+#endif
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+//// CDP2 endpoints
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2DeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetLastError(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2PeekAtLastError(void);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI __cudaCDP2GetErrorString(cudaError_t error);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI __cudaCDP2GetErrorName(cudaError_t error);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetDeviceCount(int *count);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2GetDevice(int *device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamDestroy(cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2StreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecord(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2EventDestroy(cudaEvent_t event);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2FuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Free(void *devPtr);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Malloc(void **devPtr, size_t size);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2MemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2Memset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2RuntimeGetVersion(int *runtimeVersion);
+extern __device__ __cudart_builtin__ void * CUDARTAPI __cudaCDP2GetParameterBuffer(size_t alignment, size_t size);
+extern __device__ __cudart_builtin__ void * CUDARTAPI __cudaCDP2GetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2LaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) 
+static inline  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphLaunch_ptsz(cudaGraphExec_t graphExec, cudaStream_t stream)
+{
+    /* A null stream is replaced by cudaStreamPerThread; any other
+     * stream is forwarded to cudaGraphLaunch unchanged. */
+    cudaStream_t launchStream = (stream == 0) ? cudaStreamPerThread : stream;
+    return cudaGraphLaunch(graphExec, launchStream);
+}
+#endif
+
+/** \endcond */
+
+/**
+  * \ingroup CUDART_GRAPH
+  * \brief Query the id of the device graph that is currently running.
+  *
+  * Reads the PTX special register current_graph_exec and returns its value as a ::cudaGraphExec_t.
+  * \return Returns the current device graph id, 0 if the call is outside of a device graph.
+  * \sa cudaGraphLaunch
+  */
+static inline __device__ __cudart_builtin__ cudaGraphExec_t CUDARTAPI cudaGetCurrentGraphExec(void)
+{
+    unsigned long long rawExecId; /* 64-bit destination for the special-register read */
+    asm ("mov.u64 %0, %%current_graph_exec;" : "=l"(rawExecId));
+    return (cudaGraphExec_t)rawExecId;
+}
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Updates the kernel parameters of the given kernel node
+ *
+ * Updates \p size bytes in the kernel parameters of \p node at \p offset to
+ * the contents of \p value. \p node must be device-updatable, and must reside upon the same
+ * device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param offset    - The offset into the params at which to make the update
+ * \param value     - Buffer containing the params to write
+ * \param size      - Size in bytes to update
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeSetGridDim,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParam(cudaGraphDeviceNode_t node, size_t offset, const void *value , size_t size);
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Enables or disables the given kernel node
+ *
+ * Enables or disables \p node based upon \p enable. If \p enable is true, the node will be enabled;
+ * if it is false, the node will be disabled. Disabled nodes will act as a NOP during execution.
+ * \p node must be device-updatable, and must reside upon the same device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param enable    - Whether to enable or disable the node
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetParam,
+ * ::cudaGraphKernelNodeSetGridDim,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetEnabled(cudaGraphDeviceNode_t node, bool enable);
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Updates the grid dimensions of the given kernel node
+ *
+ * Sets the grid dimensions of \p node to \p gridDim. \p node must be device-updatable,
+ * and must reside upon the same device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param gridDim   - The grid dimensions to set
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetParam,
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetGridDim(cudaGraphDeviceNode_t node, dim3 gridDim);
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Batch applies multiple kernel node updates
+ *
+ * Batch applies one or more kernel node updates based on the information provided in \p updates.
+ * \p updateCount specifies the number of updates to apply. Each entry in \p updates must specify
+ * a node to update, the type of update to apply, and the parameters for that type of update. See
+ * the documentation for ::cudaGraphKernelNodeUpdate for more detail.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param updates     - The updates to apply
+ * \param updateCount - The number of updates to apply
+ *
+ * \return
+ * cudaSuccess,
+ * cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetParam,
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeSetGridDim
+ */
+extern  __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeUpdatesApply(const cudaGraphKernelNodeUpdate *updates, size_t updateCount);
+
+/**
+  * \ingroup CUDART_EXECUTION
+  * \brief Programmatic dependency trigger
+  *
+  * This device function ensures the programmatic launch completion edges /
+  * events are fulfilled. See
+  * ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticStreamSerialization
+  * and ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticEvent for more
+  * information. The event / edge kick off only happens when every CTA
+  * in the grid has either exited or called this function at least once;
+  * otherwise the kick off happens automatically after all warps finish
+  * execution but before the grid completes. The kick off only enables
+  * scheduling of the secondary kernel. It provides no memory visibility
+  * guarantee itself. The user can enforce memory visibility by inserting a
+  * memory fence of the correct scope.
+  */
+static inline __device__ __cudart_builtin__ void CUDARTAPI cudaTriggerProgrammaticLaunchCompletion(void)
+{
+    asm volatile("griddepcontrol.launch_dependents;":::); /* empty clobber list: the trigger is not a memory fence (see above) */
+}
+
+/**
+  * \ingroup CUDART_EXECUTION
+  * \brief Programmatic grid dependency synchronization
+  *
+  * This device function will block the thread until all direct grid
+  * dependencies have completed. This API is intended to be used in conjunction
+  * with programmatic / launch event / dependency. See
+  * ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticStreamSerialization
+  * and ::cudaLaunchAttributeID::cudaLaunchAttributeProgrammaticEvent for more
+  * information.
+  */
+static inline __device__ __cudart_builtin__ void CUDARTAPI cudaGridDependencySynchronize(void)
+{
+    asm volatile("griddepcontrol.wait;":::"memory"); /* "memory" clobber: compiler must not move memory accesses across the wait */
+}
+
+/**
+  * \ingroup CUDART_GRAPH
+  * \brief Sets the condition value associated with a conditional node.
+  *
+  * Sets the condition value associated with a conditional node.
+  *
+  * Note: \p handle must be associated with the same context as the kernel calling this function.
+  *
+  * \sa cudaGraphConditionalHandleCreate
+  */
+extern __device__ __cudart_builtin__ void CUDARTAPI cudaGraphSetConditional(cudaGraphConditionalHandle handle, unsigned int value);
+
+//// CG API
+extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
+
+
+//// CDP API
+
+#ifdef __CUDA_ARCH__
+
+// Bug 4398304
+// WAR for Doxygen processing duplicate entries, which causes warnings to be listed in the documentation
+/** \cond impl_private */
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+    return __cudaCDP2DeviceGetAttribute(value, attr, device);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit)
+{
+    return __cudaCDP2DeviceGetLimit(pValue, limit);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig)
+{
+    return __cudaCDP2DeviceGetCacheConfig(pCacheConfig);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig)
+{
+    return __cudaCDP2DeviceGetSharedMemConfig(pConfig);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void)
+{
+    return __cudaCDP2GetLastError();
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void)
+{
+    return __cudaCDP2PeekAtLastError();
+}
+
+static __inline__ __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error)
+{
+    return __cudaCDP2GetErrorString(error);
+}
+
+static __inline__ __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error)
+{
+    return __cudaCDP2GetErrorName(error);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count)
+{
+    return __cudaCDP2GetDeviceCount(count);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device)
+{
+    return __cudaCDP2GetDevice(device);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags)
+{
+    return __cudaCDP2StreamCreateWithFlags(pStream, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream)
+{
+    return __cudaCDP2StreamDestroy(stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags)
+{
+    return __cudaCDP2StreamWaitEvent(stream, event, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2StreamWaitEvent_ptsz(stream, event, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) {
+    /* Pass-through: both arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2EventCreateWithFlags(event, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream) {
+    /* Pass-through: both arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2EventRecord(event, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2EventRecord_ptsz(event, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags) {
+    /* Pass-through: all three arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2EventRecordWithFlags(event, stream, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2EventRecordWithFlags_ptsz(event, stream, flags);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) {
+    /* Pass-through: event goes to the __cudaCDP2 entry point; its status is returned as-is. */
+    return __cudaCDP2EventDestroy(event);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) {
+    /* Pass-through: attr and func go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2FuncGetAttributes(attr, func);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr) {
+    /* Pass-through: devPtr goes to the __cudaCDP2 entry point; its status is returned as-is. */
+    return __cudaCDP2Free(devPtr);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) {
+    /* Pass-through: devPtr and size go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2Malloc(devPtr, size);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) {
+    /* Pass-through: all five arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2MemcpyAsync(dst, src, count, kind, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2MemcpyAsync_ptsz(dst, src, count, kind, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream) {
+    /* Pass-through: all eight arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2Memcpy2DAsync(dst, dpitch, src, spitch, width, height, kind, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2Memcpy2DAsync_ptsz(dst, dpitch, src, spitch, width, height, kind, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream) {
+    /* Pass-through: p and stream go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2Memcpy3DAsync(p, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2Memcpy3DAsync_ptsz(p, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream) {
+    /* Pass-through: all four arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2MemsetAsync(devPtr, value, count, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2MemsetAsync_ptsz(devPtr, value, count, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream) {
+    /* Pass-through: all six arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2Memset2DAsync(devPtr, pitch, value, width, height, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2Memset2DAsync_ptsz(devPtr, pitch, value, width, height, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream) {
+    /* Pass-through: all four arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2Memset3DAsync(pitchedDevPtr, value, extent, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream) {
+    /* Pass-through to the _ptsz flavour of the __cudaCDP2 entry point; argument order is kept. */
+    return __cudaCDP2Memset3DAsync_ptsz(pitchedDevPtr, value, extent, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion) {
+    /* Pass-through: runtimeVersion goes to the __cudaCDP2 entry point; its status is returned as-is. */
+    return __cudaCDP2RuntimeGetVersion(runtimeVersion);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize) {
+    /* Pass-through: all four arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSmemSize);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags) {
+    /* Pass-through: all five arguments go to the __cudaCDP2 entry point in the same order. */
+    return __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSmemSize, flags);
+}
+#endif // __CUDA_INTERNAL_USE_CDP2
+
+/** \endcond */
+
+#endif // __CUDA_ARCH__
+
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Obtains a parameter buffer
+ *
+ * Obtains a parameter buffer which can be filled with parameters for a kernel launch.
+ * Parameters passed to ::cudaLaunchDevice must be allocated via this function.
+ *
+ * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch kernels.
+ *
+ * \param alignment - Specifies alignment requirement of the parameter buffer
+ * \param size      - Specifies size requirement in bytes
+ *
+ * \return
+ * Returns pointer to the allocated parameterBuffer
+ * \notefnerr
+ *
+ * \sa cudaLaunchDevice
+ */
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size) {
+    /* With __CUDA_INTERNAL_USE_CDP2 defined, the request is served by the __cudaCDP2 entry point. */
+    return __cudaCDP2GetParameterBuffer(alignment, size);
+}
+#else /* !__CUDA_INTERNAL_USE_CDP2: the function is only declared here */
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
+#endif /* __CUDA_INTERNAL_USE_CDP2 */
+
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize) {
+    /* With __CUDA_INTERNAL_USE_CDP2 defined, the request is served by the __cudaCDP2 entry point. */
+    return __cudaCDP2GetParameterBufferV2(func, gridDimension, blockDimension, sharedMemSize);
+}
+#else /* !__CUDA_INTERNAL_USE_CDP2: the function is only declared here */
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
+#endif /* __CUDA_INTERNAL_USE_CDP2 */
+
+
+#ifdef __CUDA_INTERNAL_USE_CDP2
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream) {
+    /* With __CUDA_INTERNAL_USE_CDP2 defined, the launch is handed to the __cudaCDP2 _ptsz entry point. */
+    return __cudaCDP2LaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
+}
+
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream) {
+    /* Same redirect for the V2 form, which takes only the parameter buffer and the stream. */
+    return __cudaCDP2LaunchDeviceV2_ptsz(parameterBuffer, stream);
+}
+#else /* !__CUDA_INTERNAL_USE_CDP2: both functions are only declared here */
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
+#endif /* __CUDA_INTERNAL_USE_CDP2 */
+
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Launches a specified kernel
+ *
+ * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
+ * by calling ::cudaGetParameterBuffer().
+ *
+ * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch the kernels.
+ *
+ * \param func            - Pointer to the kernel to be launched
+ * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
+ * \param gridDimension   - Specifies grid dimensions
+ * \param blockDimension  - Specifies block dimensions
+ * \param sharedMemSize   - Specifies size of shared memory
+ * \param stream          - Specifies the stream to be used
+ *
+ * \return
+ * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
+ * \notefnerr
+ * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
+ * Guide for the detailed descriptions of launch configuration and parameter layout respectively.
+ *
+ * \sa cudaGetParameterBuffer
+ */
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+    // Device compilation with per-thread default stream enabled: define static inline
+    // redirects so the plain names resolve to the per-thread-stream (_ptsz) entry points.
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
+    {
+#ifdef __CUDA_INTERNAL_USE_CDP2
+        return __cudaCDP2LaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
+#else // !__CUDA_INTERNAL_USE_CDP2
+        return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
+#endif // __CUDA_INTERNAL_USE_CDP2
+    }
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
+    {
+#ifdef __CUDA_INTERNAL_USE_CDP2
+        return __cudaCDP2LaunchDeviceV2_ptsz(parameterBuffer, stream);
+#else // !__CUDA_INTERNAL_USE_CDP2
+        return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
+#endif // __CUDA_INTERNAL_USE_CDP2
+    }
+#else // !(defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__))
+#ifdef __CUDA_INTERNAL_USE_CDP2
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
+    {
+        return __cudaCDP2LaunchDevice(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
+    }
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
+    {
+        return __cudaCDP2LaunchDeviceV2(parameterBuffer, stream);
+    }
+#else // !__CUDA_INTERNAL_USE_CDP2: the two functions are only declared here
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
+#endif // __CUDA_INTERNAL_USE_CDP2
+#endif // defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+
+
+// These symbols should not be used outside of this header file: each name is (re)defined as an empty macro, so any later use expands to nothing.
+#define __cudaCDP2DeviceGetAttribute
+#define __cudaCDP2DeviceGetLimit
+#define __cudaCDP2DeviceGetCacheConfig
+#define __cudaCDP2DeviceGetSharedMemConfig
+#define __cudaCDP2GetLastError
+#define __cudaCDP2PeekAtLastError
+#define __cudaCDP2GetErrorString
+#define __cudaCDP2GetErrorName
+#define __cudaCDP2GetDeviceCount
+#define __cudaCDP2GetDevice
+#define __cudaCDP2StreamCreateWithFlags
+#define __cudaCDP2StreamDestroy
+#define __cudaCDP2StreamWaitEvent
+#define __cudaCDP2StreamWaitEvent_ptsz
+#define __cudaCDP2EventCreateWithFlags
+#define __cudaCDP2EventRecord
+#define __cudaCDP2EventRecord_ptsz
+#define __cudaCDP2EventRecordWithFlags
+#define __cudaCDP2EventRecordWithFlags_ptsz
+#define __cudaCDP2EventDestroy
+#define __cudaCDP2FuncGetAttributes
+#define __cudaCDP2Free
+#define __cudaCDP2Malloc
+#define __cudaCDP2MemcpyAsync
+#define __cudaCDP2MemcpyAsync_ptsz
+#define __cudaCDP2Memcpy2DAsync
+#define __cudaCDP2Memcpy2DAsync_ptsz
+#define __cudaCDP2Memcpy3DAsync
+#define __cudaCDP2Memcpy3DAsync_ptsz
+#define __cudaCDP2MemsetAsync
+#define __cudaCDP2MemsetAsync_ptsz
+#define __cudaCDP2Memset2DAsync
+#define __cudaCDP2Memset2DAsync_ptsz
+#define __cudaCDP2Memset3DAsync
+#define __cudaCDP2Memset3DAsync_ptsz
+#define __cudaCDP2RuntimeGetVersion
+#define __cudaCDP2GetParameterBuffer
+#define __cudaCDP2GetParameterBufferV2
+#define __cudaCDP2LaunchDevice_ptsz
+#define __cudaCDP2LaunchDeviceV2_ptsz
+#define __cudaCDP2LaunchDevice
+#define __cudaCDP2LaunchDeviceV2
+#define __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessor
+#define __cudaCDP2OccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+
+}
+
+// Bug 4398304
+// WAR (workaround) for doxygen processing duplicate entries, which causes warnings to be listed in the documentation
+/** \cond impl_private */
+
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+/** \endcond */
+
+/**
+ * \ingroup CUDART_GRAPH
+ * \brief Updates the kernel parameters of the given kernel node
+ *
+ * Updates the kernel parameters of \p node at \p offset to \p value. \p node must be
+ * device-updatable, and must reside upon the same device as the calling kernel.
+ *
+ * If this function is called for the node's immediate dependent and that dependent is configured
+ * for programmatic dependent launch, then a memory fence must be invoked via __threadfence() before
+ * kickoff of the dependent is triggered via ::cudaTriggerProgrammaticLaunchCompletion() to ensure
+ * that the update is visible to that dependent node before it is launched.
+ *
+ * \param node      - The node to update
+ * \param offset    - The offset into the params at which to make the update
+ * \param value     - Parameter value to write
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphKernelNodeSetEnabled,
+ * ::cudaGraphKernelNodeSetGridDim,
+ * ::cudaGraphKernelNodeUpdatesApply
+ */
+template <typename T>
+static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParam(cudaGraphDeviceNode_t node, size_t offset, const T &value) {
+    /* Typed convenience form: delegates to the four-argument overload with the value's address and sizeof(T). */
+    return cudaGraphKernelNodeSetParam(node, offset, &value, sizeof(T));
+}
+
+#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
+#endif /* defined(__cplusplus) && defined(__CUDACC__) */
+
+#undef __DEPRECATED__
+#undef __CDPRT_DEPRECATED
+#undef __CUDA_INTERNAL_USE_CDP2
+
+#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b291e7b78610171e9068b39ed5c503b35f45ad4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDA_FP4_H__
+#define __CUDA_FP4_H__
+
+/* Set up function decorations; note that __CUDA_FP4_DECL__ is only defined in the __CUDACC__ branch */
+#if defined(__CUDACC__)
+#define __CUDA_FP4_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP4__ __host__ __device__
+#define __CUDA_HOSTDEVICE_FP4_DECL__ static __host__ __device__ __inline__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP4_DECL__ static __attribute__((unused))
+#else /* !defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP4_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP4__
+#endif /* defined(__CUDACC__) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L /* non-MSVC: test __cplusplus directly */
+#define __CPP_VERSION_AT_LEAST_11_FP4
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L /* MSVC: test _MSVC_LANG instead */
+#define __CPP_VERSION_AT_LEAST_11_FP4
+#endif /* C++11-or-newer detection */
+
+/* bring in fp6 types infrastructure and dependencies */
+#include "cuda_fp6.h"
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_FP4 FP4 Intrinsics
+ * This section describes fp4 intrinsic functions.
+ * To use these functions, include the header file \p cuda_fp4.h in your
+ * program.
+ *
+ * \note Most of the operations defined here benefit from native HW support
+ * when compiled for specific GPU targets (e.g. devices of compute capability 10.0a),
+ * other targets use emulation path.
+ *
+ * The following macros are available to help users selectively enable/disable
+ * various definitions present in the header file:
+ * - \p __CUDA_NO_FP4_CONVERSIONS__ - If defined, this macro will prevent any
+ * use of the C++ type conversions (converting constructors and conversion
+ * operators) defined in the header.
+ * - \p __CUDA_NO_FP4_CONVERSION_OPERATORS__ - If defined, this macro will
+ * prevent any use of the  C++ conversion operators from \p fp4 to other types.
+ */
+
+/**
+ * \defgroup CUDA_MATH_FP4_MISC FP4 Conversion and Data Movement
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ * To use these functions, include the header file \p cuda_fp4.h in your
+ * program.
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for storage of
+ * \p fp4 floating-point numbers.
+ */
+typedef __nv_fp8_storage_t __nv_fp4_storage_t; /* alias of the fp8 storage type */
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for storage of pairs of
+ * \p fp4 floating-point numbers.
+ */
+typedef __nv_fp8_storage_t __nv_fp4x2_storage_t; /* alias of the fp8 storage type */
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief 16-bit \p unsigned \p integer
+ * type abstraction used for storage of tetrads of
+ * \p fp4 floating-point numbers.
+ */
+typedef __nv_fp8x2_storage_t __nv_fp4x4_storage_t; /* alias of the fp8x2 storage type */
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Enumerates the possible
+ * interpretations of the 4-bit values when referring to them as
+ * \p fp4 types.
+ */
+typedef enum __nv_fp4_interpretation_t {
+    __NV_E2M1, /**< Stands for \p fp4 numbers of \p e2m1 kind. */
+} __nv_fp4_interpretation_t; /* a single interpretation is enumerated here */
+
+/* Forward-declaration of C-style APIs */
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p double precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t __nv_cvt_double_to_fp4(
+    const double x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p double precision numbers packed
+ * in \p double2 \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t __nv_cvt_double2_to_fp4x2(
+    const double2 x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p single precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t __nv_cvt_float_to_fp4(
+    const float x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p single precision numbers packed
+ * in \p float2 \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t __nv_cvt_float2_to_fp4x2(
+    const float2 x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p half precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t __nv_cvt_halfraw_to_fp4(
+    const __half_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p half precision numbers packed
+ * in \p __half2_raw \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_halfraw2_to_fp4x2(const __half2_raw x,
+                           const __nv_fp4_interpretation_t fp4_interpretation,
+                           const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_bfloat16raw_to_fp4(const __nv_bfloat16_raw x,
+                            const __nv_fp4_interpretation_t fp4_interpretation,
+                            const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp4 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp4 values of the
+ * kind specified by \p fp4_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter.
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp4x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__
+__nv_fp4x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp4x2(const __nv_bfloat162_raw x,
+                               const __nv_fp4_interpretation_t fp4_interpretation,
+                               const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input \p fp4 \p x of the specified kind
+ * to \p half precision.
+ *
+ * \details Converts input \p x of \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter
+ * to \p half precision.
+ *
+ * \returns
+ * - The \p __half_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __half_raw __nv_cvt_fp4_to_halfraw(
+    const __nv_fp4_storage_t x,
+    const __nv_fp4_interpretation_t fp4_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP4_MISC
+ * \brief Converts input vector of two \p fp4 values of the specified kind
+ * to a vector of two \p half precision values packed in \p __half2_raw
+ * structure.
+ *
+ * \details Converts input vector \p x of \p fp4 type of the kind specified by
+ * \p fp4_interpretation parameter
+ * to a vector of two \p half precision values and returns as \p __half2_raw
+ * structure.
+ *
+ * \returns
+ * - The \p __half2_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __half2_raw __nv_cvt_fp4x2_to_halfraw2(
+    const __nv_fp4x2_storage_t x,
+    const __nv_fp4_interpretation_t fp4_interpretation);
+
+#if defined(__cplusplus)
+
+#define __CUDA_FP4_TYPES_EXIST__
+
+/* Forward declarations (C++ builds only) of the structures defined in "cuda_fp4.hpp" */
+struct __nv_fp4_e2m1;
+struct __nv_fp4x2_e2m1;
+struct __nv_fp4x4_e2m1;
+
+#endif /* defined(__cplusplus) */
+
+#include "cuda_fp4.hpp"
+
+#undef __CUDA_FP4_DECL__ /* only defined under __CUDACC__; #undef of an undefined name is a no-op */
+#undef __CUDA_HOSTDEVICE_FP4__
+#undef __CUDA_HOSTDEVICE_FP4_DECL__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+#undef __CPP_VERSION_AT_LEAST_11_FP4
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#endif /* end of include guard: __CUDA_FP4_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..97716a433d797c49961ee0ba33b4fb84e9b3bb1d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp4.hpp
@@ -0,0 +1,953 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_FP4_HPP__)
+#define __CUDA_FP4_HPP__
+
+#if !defined(__CUDA_FP4_H__)
+#error "Do not include this file directly. Instead, include cuda_fp4.h."
+#endif
+
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+/*
+ * Bring in the standard assertions header to enforce the subset
+ * of rounding modes supported by the APIs defined here.
+ * NOTE: NVRTC defines its own assert
+ */
+#if !defined (__CUDACC_RTC__)
+#include <assert.h>
+#endif
+
+/* Set up structure-alignment attribute */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !defined(__CPP_VERSION_AT_LEAST_11_FP4)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP4)
+/* need c++11 for explicit operators */
+#define __CUDA_NO_FP4_CONVERSION_OPERATORS__
+#endif
+
+#if !(defined __DOXYGEN_ONLY__)
+
+/*
+ * Converts a double to fp4 storage by operating directly on the bit pattern
+ * of x (read as 1 sign bit, 11 exponent bits with bias 1023, 52 mantissa
+ * bits).
+ *
+ * x                  - value to convert.
+ * fp4_interpretation - not read by this implementation; every constant below
+ *                      is hard-coded for the e2m1 kind.
+ * rounding           - cudaRoundNearest rounds to nearest, ties to even. Any
+ *                      other value leaves the truncated (toward zero) result;
+ *                      the normal and denormal branches assert that it is
+ *                      cudaRoundZero.
+ *
+ * Returns the 4-bit code (sign in bit 3, exponent in bits 2:1, mantissa in
+ * bit 0) in the low nibble of the result:
+ *  - |x| <= 2^-2 gives a zero magnitude, sign preserved;
+ *  - |x| > 6.0, including infinities, gives 0x7 with the sign preserved;
+ *  - NaN gives positive 0x7.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_double_to_fp4(const double x,
+                       const __nv_fp4_interpretation_t fp4_interpretation,
+                       const enum cudaRoundMode rounding) {
+    unsigned char res;
+    unsigned long long int xbits;
+
+    // copy the bits of x into an integer (type punning through memcpy)
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    unsigned char FP4_MAXNORM;
+    unsigned char FP4_MANTISSA_MASK;
+    unsigned short int FP4_EXP_BIAS;
+    unsigned long long int FP4_SIGNIFICAND_BITS;
+    unsigned long long int FP4_MINDENORM_O2;
+    unsigned long long int FP4_OVERFLOW_THRESHOLD;
+    unsigned long long int FP4_MINNORM;
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
+
+    // fp4_interpretation == __NV_E2M1
+    FP4_EXP_BIAS = 1U;
+    FP4_SIGNIFICAND_BITS = 2ULL;
+    FP4_MANTISSA_MASK = 0x1U;
+    FP4_MINDENORM_O2 = 0x3FD0000000000000ULL; // mindenorm/2 = 2^-2
+    FP4_OVERFLOW_THRESHOLD =
+        0x4018000000000000ULL; // maxnorm = 6.0
+    FP4_MAXNORM = 0x7U;
+    FP4_MINNORM = 0x3FF0000000000000ULL; // minnorm = 2^0
+
+    // 1/2 LSB of the target format, positioned in double precision mantissa
+    // helpful in midpoints detection during round-to-nearest-even step
+    const unsigned long long int FP4_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP4_SIGNIFICAND_BITS - 1ULL);
+    // prepare sign bit in target format
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 3U);
+    // prepare exponent field in target format
+    // (for inputs below 2^0 the re-biased value is negative and wraps when
+    // narrowed to unsigned char; only the denormal branch sees that case)
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP4_EXP_BIAS);
+    // round mantissa to target format width, rounding towards zero
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP4_SIGNIFICAND_BITS)) &
+        FP4_MANTISSA_MASK;
+    // magnitude bits only; comparing them as integers orders the magnitudes
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL;
+
+    if (absx <= FP4_MINDENORM_O2) {
+        // zero or underflow
+        res = 0U;
+    } else if (absx > FP4_OVERFLOW_THRESHOLD) {
+        // overflow or NaN
+        if (absx > DP_INF_BITS)
+        {
+            // NaN converts to positive FP4_MAXNORM
+            sign = 0U;
+        }
+        res = FP4_MAXNORM;
+    } else if (absx >= FP4_MINNORM) {
+        // normal range: assemble exponent and truncated mantissa
+        res = (unsigned char)((exp << (FP4_SIGNIFICAND_BITS - 1U)) | mantissa);
+        // rounded-off bits
+        unsigned long long int round =
+            xbits & ((FP4_DP_HALF_ULP << 1ULL) - 1ULL);
+        if (rounding == cudaRoundNearest)
+        {
+            // round-to-nearest-even adjustment
+            if ((round > FP4_DP_HALF_ULP) ||
+                ((round == FP4_DP_HALF_ULP) && (mantissa & 1U))) {
+                res = (unsigned char)(res + 1U);
+            }
+        } else {
+            assert(rounding == cudaRoundZero);
+        }
+    } else // Denormal range
+    {
+        // exp is 0 or 0xFF (wrapped -1) in this branch, so after narrowing
+        // to unsigned char the shift is 1 or 2
+        unsigned char shift = (unsigned char)(1U - exp);
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP4_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+
+        if (rounding == cudaRoundNearest)
+        {
+            // rounded-off bits, including implicit leading bit
+            unsigned long long int round =
+                (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+                ((FP4_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+            // round-to-nearest-even adjustment
+            if ((round > (FP4_DP_HALF_ULP << shift)) ||
+                ((round == (FP4_DP_HALF_ULP << shift)) && (res & 1U))) {
+                res = (unsigned char)(res + 1U);
+            }
+        } else {
+            assert(rounding == cudaRoundZero);
+        }
+    }
+
+    // merge the sign bit (cleared above for NaN inputs)
+    res |= sign;
+
+    return (__nv_fp4_storage_t)res;
+}
+
+/*
+ * Converts both components of a double2 with __nv_cvt_double_to_fp4 and packs
+ * them into one storage value: x.y goes to the upper nibble, x.x to the
+ * lower nibble.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_double2_to_fp4x2(const double2 x,
+                          const __nv_fp4_interpretation_t fp4_interpretation,
+                          const enum cudaRoundMode rounding) {
+    const __nv_fp4x2_storage_t upper =
+        (__nv_fp4x2_storage_t)__nv_cvt_double_to_fp4(x.y, fp4_interpretation,
+                                                     rounding);
+    const __nv_fp4_storage_t lower =
+        __nv_cvt_double_to_fp4(x.x, fp4_interpretation, rounding);
+    const __nv_fp4x2_storage_t shifted = (__nv_fp4x2_storage_t)(upper << 4U);
+    return (__nv_fp4x2_storage_t)(shifted | lower);
+}
+
+/*
+ * Converts a float to fp4 storage. When the architecture check below holds
+ * and rounding is cudaRoundNearest, a single cvt instruction is used;
+ * otherwise the value is widened to double and handed to
+ * __nv_cvt_double_to_fp4.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_float_to_fp4(const float x,
+                      const __nv_fp4_interpretation_t fp4_interpretation,
+                      const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest)
+    {
+        // fp4_interpretation == __NV_E2M1
+        // the second converted operand is a constant 0.0f
+        unsigned short packed;
+        asm("{ .reg .b8 __$temp1;                           \n"
+            " cvt.rn.satfinite.e2m1x2.f32 __$temp1, %2, %1; \n"
+            " mov.b16 %0, {__$temp1, 0};                   }\n"
+            : "=h"(packed)
+            : "f"(x), "f"(0.0f));
+        return (__nv_fp4_storage_t)packed;
+    }
+#endif
+    // generic path: go through the double precision conversion
+    return __nv_cvt_double_to_fp4((double)x, fp4_interpretation, rounding);
+}
+
+/*
+ * Converts both components of a float2 and packs them into one storage
+ * value. When the architecture check below holds and rounding is
+ * cudaRoundNearest, a single cvt instruction handles the pair; otherwise each
+ * component goes through __nv_cvt_float_to_fp4 with x.y in the upper nibble
+ * and x.x in the lower nibble.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_float2_to_fp4x2(const float2 x,
+                         const __nv_fp4_interpretation_t fp4_interpretation,
+                         const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest) {
+        // fp4_interpretation == __NV_E2M1
+        unsigned short packed;
+        asm("{ .reg .b8 __$temp1;                           \n"
+            " cvt.rn.satfinite.e2m1x2.f32 __$temp1, %2, %1; \n"
+            " mov.b16 %0, {__$temp1, 0};                   }\n"
+            : "=h"(packed)
+            : "f"(x.x), "f"(x.y));
+        return (__nv_fp4x2_storage_t)packed;
+    }
+#endif
+    // generic path: convert element by element, upper nibble first
+    const __nv_fp4x2_storage_t upper =
+        (__nv_fp4x2_storage_t)__nv_cvt_float_to_fp4(x.y, fp4_interpretation,
+                                                    rounding);
+    const __nv_fp4_storage_t lower =
+        __nv_cvt_float_to_fp4(x.x, fp4_interpretation, rounding);
+    const __nv_fp4x2_storage_t shifted = (__nv_fp4x2_storage_t)(upper << 4U);
+    return (__nv_fp4x2_storage_t)(shifted | lower);
+}
+
+/*
+ * Converts a __half_raw value to fp4 storage by widening it to float with
+ * __internal_halfraw_to_float and delegating to __nv_cvt_float_to_fp4.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t
+__nv_cvt_halfraw_to_fp4(const __half_raw x,
+                        const __nv_fp4_interpretation_t fp4_interpretation,
+                        const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    const float widened = __internal_halfraw_to_float(x);
+    return __nv_cvt_float_to_fp4(widened, fp4_interpretation, rounding);
+}
+
+/*
+ * Converts the two halves of a __half2_raw with __nv_cvt_halfraw_to_fp4 and
+ * packs them into one storage value: x.y goes to the upper nibble, x.x to the
+ * lower nibble.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t __nv_cvt_halfraw2_to_fp4x2(
+    const __half2_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    __half_raw element;
+
+    // lower nibble comes from x.x
+    element.x = x.x;
+    const __nv_fp4_storage_t lower =
+        __nv_cvt_halfraw_to_fp4(element, fp4_interpretation, rounding);
+
+    // upper nibble comes from x.y
+    element.x = x.y;
+    const __nv_fp4_storage_t upper =
+        __nv_cvt_halfraw_to_fp4(element, fp4_interpretation, rounding);
+
+    const __nv_fp4x2_storage_t shifted = (__nv_fp4x2_storage_t)(upper << 4U);
+    return (__nv_fp4x2_storage_t)(shifted | lower);
+}
+
+/*
+ * Converts a __nv_bfloat16_raw value to fp4 storage by widening it to float
+ * with __internal_bf16raw_to_float and delegating to __nv_cvt_float_to_fp4.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4_storage_t __nv_cvt_bfloat16raw_to_fp4(
+    const __nv_bfloat16_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding) {
+    return __nv_cvt_float_to_fp4(__internal_bf16raw_to_float(x),
+                                 fp4_interpretation, rounding);
+}
+
+/*
+ * Converts the two elements of a __nv_bfloat162_raw with
+ * __nv_cvt_bfloat16raw_to_fp4 and packs them into one storage value: x.y goes
+ * to the upper nibble, x.x to the lower nibble.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __nv_fp4x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp4x2(
+    const __nv_bfloat162_raw x,
+    const __nv_fp4_interpretation_t fp4_interpretation,
+    const enum cudaRoundMode rounding) {
+    __nv_bfloat16_raw element;
+
+    // upper nibble comes from x.y
+    element.x = x.y;
+    const __nv_fp4x2_storage_t upper =
+        (__nv_fp4x2_storage_t)__nv_cvt_bfloat16raw_to_fp4(
+            element, fp4_interpretation, rounding);
+
+    // lower nibble comes from x.x
+    element.x = x.x;
+    const __nv_fp4_storage_t lower =
+        __nv_cvt_bfloat16raw_to_fp4(element, fp4_interpretation, rounding);
+
+    const __nv_fp4x2_storage_t shifted = (__nv_fp4x2_storage_t)(upper << 4U);
+    return (__nv_fp4x2_storage_t)(shifted | lower);
+}
+
+/* Declared ahead of its definition because the scalar conversion below
+ * calls it when the architecture check holds. */
+__CUDA_HOSTDEVICE_FP4_DECL__ __half2_raw
+__nv_cvt_fp4x2_to_halfraw2(const __nv_fp4x2_storage_t x,
+                           const __nv_fp4_interpretation_t fp4_interpretation);
+
+/*
+ * Converts one fp4 value to __half_raw. When the architecture check below
+ * holds, the paired conversion is used and its .x member is returned;
+ * otherwise the low 4 bits are shifted into an e2m3 fp6 value and converted
+ * with __nv_cvt_fp6_to_halfraw.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __half_raw
+__nv_cvt_fp4_to_halfraw(const __nv_fp4_storage_t x,
+                        const __nv_fp4_interpretation_t fp4_interpretation) {
+    __half_raw res;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    const __half2_raw pair =
+        __nv_cvt_fp4x2_to_halfraw2((__nv_fp4x2_storage_t)x, fp4_interpretation);
+    res.x = pair.x;
+#else
+    // fp4_interpretation == __NV_E2M1
+    // widen to e2m3 first: the two extra mantissa bits are zero
+    const __nv_fp6_storage_t as_e2m3 = (__nv_fp6_storage_t)((x & 0xFU) << 2U);
+    res = __nv_cvt_fp6_to_halfraw(as_e2m3, __NV_E2M3);
+#endif
+    return res;
+}
+
+/*
+ * Converts a packed pair of fp4 values to __half2_raw. When the architecture
+ * check below holds, one cvt instruction converts both values; otherwise each
+ * nibble goes through __nv_cvt_fp4_to_halfraw, with x feeding .x and x >> 4
+ * feeding .y.
+ */
+__CUDA_HOSTDEVICE_FP4_DECL__ __half2_raw
+__nv_cvt_fp4x2_to_halfraw2(const __nv_fp4x2_storage_t x,
+                           const __nv_fp4_interpretation_t fp4_interpretation) {
+    __half2_raw res;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned int packed_halves;
+    unsigned short wide_input = (unsigned short)x;
+    asm("{ .reg .b8 __$temp1, __$tempz;                 \n"
+        " mov.b16 {__$temp1, __$tempz}, %1;             \n"
+        " cvt.rn.f16x2.e2m1x2 %0, __$temp1;            }\n"
+        : "=r"(packed_halves)
+        : "h"(wide_input));
+    // copy the 32-bit register image into the result structure
+    (void)memcpy(&res, &packed_halves, sizeof(packed_halves));
+#else
+    const __half_raw first =
+        __nv_cvt_fp4_to_halfraw((__nv_fp4_storage_t)x, fp4_interpretation);
+    const __half_raw second = __nv_cvt_fp4_to_halfraw(
+        (__nv_fp4_storage_t)(x >> 4U), fp4_interpretation);
+    res.x = first.x;
+    res.y = second.x;
+#endif
+    return res;
+}
+
+/* Packs two bytes into 16 bits: src_hi fills bits 15:8, src_lo bits 7:0. */
+__CUDA_HOSTDEVICE_FP4_DECL__ unsigned short int
+__internal_pack_u8x2_to_u16(const unsigned char src_lo,
+                            const unsigned char src_hi) {
+    const unsigned short int low_byte = (unsigned short int)src_lo;
+    const unsigned short int high_byte = (unsigned short int)src_hi;
+    return (high_byte << 8U) | low_byte;
+}
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \defgroup CUDA_MATH_FP4_E2M1_STRUCT C++ struct for handling fp4 data type of e2m1 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4_E2M1_STRUCT
+ * \brief __nv_fp4_e2m1 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp4 floating-point numbers of \p e2m1 kind:
+ * with 1 sign, 2 exponent, 1 implicit and 1 explicit mantissa bits.
+ * This encoding does not support Inf/NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp4_e2m1 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP4_E2M1_STRUCT
+     * Raw storage for the \p fp4 floating-point value.
+     */
+    __nv_fp4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Default constructor.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+    __nv_fp4_e2m1() = default;
+#else
+    __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#if !defined(__CUDA_NO_FP4_CONVERSIONS__)
+
+    /* Converting constructors from wider floating-point types. */
+    /* The storage member is assigned in the constructor body instead of a
+     * constructor init-list because of special host/device compilation
+     * rules. */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from a \p __half input using \p cudaRoundNearest
+     * rounding; out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const __half f) {
+        const __half_raw raw = static_cast<__half_raw>(f);
+        __x = __nv_cvt_halfraw_to_fp4(raw, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from a \p __nv_bfloat16 input using
+     * \p cudaRoundNearest rounding; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const __nv_bfloat16 f) {
+        const __nv_bfloat16_raw raw = static_cast<__nv_bfloat16_raw>(f);
+        __x = __nv_cvt_bfloat16raw_to_fp4(raw, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from a \p float input using \p cudaRoundNearest
+     * rounding; out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const float f) {
+        __x = __nv_cvt_float_to_fp4(f, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from a \p double input using \p cudaRoundNearest
+     * rounding; out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const double f) {
+        __x = __nv_cvt_double_to_fp4(f, __NV_E2M1, cudaRoundNearest);
+    }
+
+    /* Converting constructors from integral types: each one casts the input
+     * to float and applies the same conversion as the float constructor. */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from an \p unsigned \p short \p int input;
+     * out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__
+    __nv_fp4_e2m1(const unsigned short int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from an \p unsigned \p int input; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const unsigned int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from an \p unsigned \p long \p int input;
+     * out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const unsigned long int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from an \p unsigned \p long \p long \p int input;
+     * out-of-range inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__
+    __nv_fp4_e2m1(const unsigned long long int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from a \p short \p int input.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const short int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from an \p int input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from a \p long \p int input; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const long int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the value from a \p long \p long \p int input; out-of-range
+     * inputs follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4_e2m1(const long long int val) {
+        __x = __nv_cvt_float_to_fp4(static_cast<float>(val), __NV_E2M1,
+                                    cudaRoundNearest);
+    }
+
+#if !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__)
+    /* Conversions to wider floating-point types */
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p __half.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp4_to_halfraw(__x, __NV_E2M1));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p float, going through \p __half_raw.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator float() const {
+        const __half_raw raw = __nv_cvt_fp4_to_halfraw(__x, __NV_E2M1);
+        return __internal_halfraw_to_float(raw);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p __nv_bfloat16, going through \p float.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(static_cast<float>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p double, going through \p float.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator double() const {
+        return static_cast<double>(static_cast<float>(*this));
+    }
+
+    /* Conversions to integral types */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p unsigned \p char.
+     * Negative inputs are clamped to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned char() const {
+        const float as_float = static_cast<float>(*this);
+        // negative values saturate at the minimum, everything else is a
+        // plain float -> unsigned char conversion
+        return (as_float < 0.0f) ? static_cast<unsigned char>(0U)
+                                 : static_cast<unsigned char>(as_float);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p unsigned \p short \p int.
+     * Negative inputs are clamped to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned short int() const {
+        return __half2ushort_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p unsigned \p int.
+     * Negative inputs are clamped to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned int() const {
+        return __half2uint_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p unsigned \p long \p int.
+     * Negative inputs are clamped to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned long int() const {
+        unsigned long converted;
+        /* Silence VS warning C4127 (conditional expression is constant) */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // unsigned long is as wide as unsigned long long
+            converted = static_cast<unsigned long>(
+                __half2ull_rz(static_cast<__half>(*this)));
+        }
+        else
+        {
+            converted = static_cast<unsigned long>(
+                __half2uint_rz(static_cast<__half>(*this)));
+        }
+        return converted;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p unsigned \p long \p long \p int.
+     * Negative inputs are clamped to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator unsigned long long int() const {
+        return __half2ull_rz(static_cast<__half>(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p signed \p char, going through \p float.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator signed char() const {
+        return static_cast<signed char>(static_cast<float>(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to the implementation defined \p char type.
+     *
+     * The signedness of \p char is detected and the matching signed or
+     * unsigned char operator is used; see those operators for details.
+     *
+     * Clamps inputs to the output range.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator char() const {
+        char converted;
+        /* Silence VS warning C4127 (conditional expression is constant) */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0)
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // char is signed here
+            converted = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            // char is unsigned here
+            converted = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return converted;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p short \p int.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator short int() const {
+        return __half2short_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p int.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator int() const {
+        return __half2int_rz(static_cast<__half>(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p long \p int.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator long int() const {
+        long converted;
+        /* Silence VS warning C4127 (conditional expression is constant) */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // long is as wide as long long
+            converted =
+                static_cast<long>(__half2ll_rz(static_cast<__half>(*this)));
+        }
+        else
+        {
+            converted =
+                static_cast<long>(__half2int_rz(static_cast<__half>(*this)));
+        }
+        return converted;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p long \p long \p int.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator long long int() const {
+        return __half2ll_rz(static_cast<__half>(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored value to \p bool.
+     * Both +0 and -0 give \p false; every other input gives \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator bool() const {
+        // mask off the sign bit so that -0 also compares equal to zero
+        return 0U != (__x & 0x7U);
+    }
+#endif /* !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP4_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP4X2_E2M1_STRUCT C++ struct for handling vector type of two fp4 values of e2m1 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4X2_E2M1_STRUCT
+ * \brief __nv_fp4x2_e2m1 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp4 floating-point numbers of \p e2m1 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp4x2_e2m1 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP4X2_E2M1_STRUCT
+     * Raw storage for the packed pair of \p fp4 floating-point values.
+     */
+    __nv_fp4x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Default constructor.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+    __nv_fp4x2_e2m1() = default;
+#else
+    __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#if !defined(__CUDA_NO_FP4_CONVERSIONS__)
+
+    /* Converting constructors from wider vector types; all of them request
+     * cudaRoundNearest rounding. */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p __half2 input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const __half2 f) {
+        const __half2_raw raw = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp4x2(raw, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p __nv_bfloat162 input; out-of-range inputs
+     * follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp4x2(raw, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p float2 input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const float2 f) {
+        __x = __nv_cvt_float2_to_fp4x2(f, __NV_E2M1, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Builds the pair from a \p double2 input; out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x2_e2m1(const double2 f) {
+        __x = __nv_cvt_double2_to_fp4x2(f, __NV_E2M1, cudaRoundNearest);
+    }
+
+#if !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__)
+    /* Conversions to wider vector types */
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored pair to \p __half2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator __half2() const {
+        return static_cast<__half2>(__nv_cvt_fp4x2_to_halfraw2(__x, __NV_E2M1));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Converts the stored pair to \p float2, going through \p __half2_raw.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator float2() const {
+        const __half2_raw raw = __nv_cvt_fp4x2_to_halfraw2(__x, __NV_E2M1);
+        return __internal_halfraw2_to_float2(raw);
+    }
+#endif /* !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP4_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP4X4_E2M1_STRUCT C++ struct for handling vector type of four fp4 values of e2m1 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP4
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP4X4_E2M1_STRUCT
+ * \brief __nv_fp4x4_e2m1 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp4 floating-point numbers of \p e2m1 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp4x4_e2m1 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP4X4_E2M1_STRUCT
+     * Storage variable contains the vector of four \p fp4 floating-point data
+     * values; the \p float4 operator decodes \p __x and \p __x >> 8 as two pairs.
+     */
+    __nv_fp4x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Default constructor; it does not assign a value to \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP4)
+    __nv_fp4x4_e2m1() = default;
+#else
+    __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP4) */
+
+#if !defined(__CUDA_NO_FP4_CONVERSIONS__)
+
+    /* Construct from wider types (all use e2m1 kind and cudaRoundNearest) */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from a pair of \p __half2 data type values (\p flo, \p fhi),
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp4x2_storage_t rlo = __nv_cvt_halfraw2_to_fp4x2(
+            static_cast<__half2_raw>(flo), __NV_E2M1, cudaRoundNearest);
+        const __nv_fp4x2_storage_t rhi = __nv_cvt_halfraw2_to_fp4x2(
+            static_cast<__half2_raw>(fhi), __NV_E2M1, cudaRoundNearest);
+        __x = __internal_pack_u8x2_to_u16(rlo, rhi); // combine the two converted pairs into __x (helper is defined outside this view)
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from a pair of \p __nv_bfloat162 data type values (\p flo, \p fhi),
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp4x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp4x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_E2M1, cudaRoundNearest);
+        const __nv_fp4x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp4x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_E2M1, cudaRoundNearest);
+        __x = __internal_pack_u8x2_to_u16(rlo, rhi); // combine the two converted pairs into __x (helper is defined outside this view)
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p float4 vector data type (x,y form one pair, z,w the other),
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const float4 f) {
+        const float2 flo = {f.x, f.y}; // first pair
+        const float2 fhi = {f.z, f.w}; // second pair
+        const __nv_fp4x2_storage_t rlo =
+            __nv_cvt_float2_to_fp4x2(flo, __NV_E2M1, cudaRoundNearest);
+        const __nv_fp4x2_storage_t rhi =
+            __nv_cvt_float2_to_fp4x2(fhi, __NV_E2M1, cudaRoundNearest);
+        __x = __internal_pack_u8x2_to_u16(rlo, rhi); // combine the two converted pairs into __x (helper is defined outside this view)
+    }
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Constructor from \p double4 vector data type (x,y form one pair, z,w the other),
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ __nv_fp4x4_e2m1(const double4 f) {
+        const double2 flo = {f.x, f.y}; // first pair
+        const double2 fhi = {f.z, f.w}; // second pair
+        const __nv_fp4x2_storage_t rlo =
+            __nv_cvt_double2_to_fp4x2(flo, __NV_E2M1, cudaRoundNearest);
+        const __nv_fp4x2_storage_t rhi =
+            __nv_cvt_double2_to_fp4x2(fhi, __NV_E2M1, cudaRoundNearest);
+        __x = __internal_pack_u8x2_to_u16(rlo, rhi); // combine the two converted pairs into __x (helper is defined outside this view)
+    }
+
+#if !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__)
+    /* Widening converts */
+
+    /**
+     * \ingroup CUDA_MATH_FP4_MISC
+     * Conversion operator to \p float4 vector data type (decodes via \p half precision).
+     */
+    explicit __CUDA_HOSTDEVICE_FP4__ operator float4() const {
+        const __nv_fp4x2_storage_t slo = static_cast<__nv_fp4x2_storage_t>(__x); // presumably narrows to the low byte; storage typedefs are outside this view
+        const __nv_fp4x2_storage_t shi =
+            static_cast<__nv_fp4x2_storage_t>(__x >> 8U); // bits 8 and up hold the second pair
+        float2 rlo = __internal_halfraw2_to_float2(
+            __nv_cvt_fp4x2_to_halfraw2(slo, __NV_E2M1));
+        float2 rhi = __internal_halfraw2_to_float2(
+            __nv_cvt_fp4x2_to_halfraw2(shi, __NV_E2M1));
+        float4 res = {rlo.x, rlo.y, rhi.x, rhi.y}; // first pair fills x,y; second pair fills z,w
+        return res;
+    }
+#endif /* !defined(__CUDA_NO_FP4_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP4_CONVERSIONS__) */
+};
+
+#endif /* defined(__cplusplus) */
+
+#endif /* end of include guard: __CUDA_FP4_HPP__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b0d6d5abf0781f4e378877d7395818898fa9ce0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.h
@@ -0,0 +1,362 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDA_FP6_H__
+#define __CUDA_FP6_H__
+
+/* Set up function decorations: CUDA qualifiers under nvcc, plain static otherwise */
+#if defined(__CUDACC__)
+#define __CUDA_FP6_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP6__ __host__ __device__
+#define __CUDA_HOSTDEVICE_FP6_DECL__ static __host__ __device__ __inline__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP6_DECL__ static __attribute__((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP6_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP6__
+#endif /* defined(__CUDACC__) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L /* non-MSVC: check __cplusplus */
+#define __CPP_VERSION_AT_LEAST_11_FP6
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L /* MSVC: check _MSVC_LANG and the compiler build */
+#define __CPP_VERSION_AT_LEAST_11_FP6
+#endif /* C++11 detection; the macro is undefined again at the end of this header */
+
+/* bring in fp8 types infrastructure and dependencies */
+#include "cuda_fp8.h"
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_FP6 FP6 Intrinsics
+ * This section describes fp6 intrinsic functions.
+ * To use these functions, include the header file \p cuda_fp6.h in your
+ * program.
+ *
+ * \note Most of the operations defined here benefit from native HW support
+ * when compiled for specific GPU targets (e.g. devices of compute capability 10.0a);
+ * other targets use an emulation path.
+ *
+ * The following macros are available to help users selectively enable/disable
+ * various definitions present in the header file:
+ * - \p __CUDA_NO_FP6_CONVERSIONS__ - If defined, this macro will prevent any
+ * use of the C++ type conversions (converting constructors and conversion
+ * operators) defined in the header.
+ * - \p __CUDA_NO_FP6_CONVERSION_OPERATORS__ - If defined, this macro will
+ * prevent any use of the  C++ conversion operators from \p fp6 to other types.
+ */
+
+/**
+ * \defgroup CUDA_MATH_FP6_MISC FP6 Conversion and Data Movement
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ * To use these functions, include the header file \p cuda_fp6.h in your
+ * program.
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for \p fp6 floating-point
+ * numbers storage.
+ */
+typedef __nv_fp8_storage_t __nv_fp6_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief 16-bit \p unsigned \p integer
+ * type abstraction used for storage of pairs of
+ * \p fp6 floating-point numbers.
+ */
+typedef __nv_fp8x2_storage_t __nv_fp6x2_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief 32-bit \p unsigned \p integer
+ * type abstraction used for storage of tetrads of
+ * \p fp6 floating-point numbers.
+ */
+typedef __nv_fp8x4_storage_t __nv_fp6x4_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Enumerates the possible
+ * interpretations of the 8-bit values when referring to them as
+ * \p fp6 types.
+ */
+typedef enum __nv_fp6_interpretation_t {
+    __NV_E2M3, /**< Stands for \p fp6 numbers of \p e2m3 kind (1 sign, 2 exponent, 3 mantissa bits). */
+    __NV_E3M2, /**< Stands for \p fp6 numbers of \p e3m2 kind (1 sign, 3 exponent, 2 mantissa bits). */
+} __nv_fp6_interpretation_t;
+
+/* Forward-declaration of C-style APIs */
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p double precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_double_to_fp6(const double x,
+                       const __nv_fp6_interpretation_t fp6_interpretation,
+                       const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p double precision numbers packed
+ * in \p double2 \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_double2_to_fp6x2(const double2 x,
+                          const __nv_fp6_interpretation_t fp6_interpretation,
+                          const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p single precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter, using
+ * rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_float_to_fp6(const float x,
+                      const __nv_fp6_interpretation_t fp6_interpretation,
+                      const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p single precision numbers packed
+ * in \p float2 \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_float2_to_fp6x2(const float2 x,
+                         const __nv_fp6_interpretation_t fp6_interpretation,
+                         const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p half precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_halfraw_to_fp6(const __half_raw x,
+                        const __nv_fp6_interpretation_t fp6_interpretation,
+                        const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p half precision numbers packed
+ * in \p __half2_raw \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t __nv_cvt_halfraw2_to_fp6x2(
+    const __half2_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input \p x to \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t __nv_cvt_bfloat16raw_to_fp6(
+    const __nv_bfloat16_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp6 type of the
+ * requested kind using specified rounding mode and saturating
+ * the out-of-range values.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp6 values of the
+ * kind specified by \p fp6_interpretation parameter,
+ * using rounding mode specified by \p rounding parameter (the implementation asserts \p cudaRoundNearest or \p cudaRoundZero).
+ * Large out-of-range values saturate to MAXNORM of the same sign.
+ * \p NaN input values result in positive MAXNORM.
+ *
+ * \returns
+ * - The \p __nv_fp6x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp6x2(
+    const __nv_bfloat162_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding);
+
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input \p fp6 \p x of the specified kind
+ * to \p half precision.
+ *
+ * \details Converts input \p x of \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter
+ * to \p half precision.
+ *
+ * \returns
+ * - The \p __half_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half_raw
+__nv_cvt_fp6_to_halfraw(const __nv_fp6_storage_t x,
+                        const __nv_fp6_interpretation_t fp6_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP6_MISC
+ * \brief Converts input vector of two \p fp6 values of the specified kind
+ * to a vector of two \p half precision values packed in \p __half2_raw
+ * structure.
+ *
+ * \details Converts input vector \p x of \p fp6 type of the kind specified by
+ * \p fp6_interpretation parameter
+ * to a vector of two \p half precision values and returns as \p __half2_raw
+ * structure.
+ *
+ * \returns
+ * - The \p __half2_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half2_raw
+__nv_cvt_fp6x2_to_halfraw2(const __nv_fp6x2_storage_t x,
+                           const __nv_fp6_interpretation_t fp6_interpretation);
+
+#if defined(__cplusplus)
+
+#define __CUDA_FP6_TYPES_EXIST__
+
+/* Forward-declaration of structures defined in "cuda_fp6.hpp" */
+struct __nv_fp6_e3m2;
+struct __nv_fp6x2_e3m2;
+struct __nv_fp6x4_e3m2;
+
+struct __nv_fp6_e2m3;
+struct __nv_fp6x2_e2m3;
+struct __nv_fp6x4_e2m3;
+
+#endif /* defined(__cplusplus) */
+
+#include "cuda_fp6.hpp"
+
+#undef __CUDA_FP6_DECL__
+#undef __CUDA_HOSTDEVICE_FP6__
+#undef __CUDA_HOSTDEVICE_FP6_DECL__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+#undef __CPP_VERSION_AT_LEAST_11_FP6
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#endif /* end of include guard: __CUDA_FP6_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3848cebcc7629972a58c8a330a25195c5c565d4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp6.hpp
@@ -0,0 +1,1549 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_FP6_HPP__)
+#define __CUDA_FP6_HPP__
+
+#if !defined(__CUDA_FP6_H__)
+#error "Do not include this file directly. Instead, include cuda_fp6.h."
+#endif
+
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+/*
+ * Bring in the standard assertions header to enforce the subset
+ * of rounding modes supported by the APIs defined here.
+ * NOTE: NVRTC defines its own assert
+ */
+#if !defined (__CUDACC_RTC__)
+#include <assert.h>
+#endif
+
+/* Set up structure-alignment attribute */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !(__cplusplus >= 201103L) */
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* __cplusplus >= 201103L */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP6)
+/* explicit conversion operators need C++11, so they are disabled otherwise */
+#define __CUDA_NO_FP6_CONVERSION_OPERATORS__
+#endif /* !(defined __CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !(defined __DOXYGEN_ONLY__)
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_double_to_fp6(const double x,
+                       const __nv_fp6_interpretation_t fp6_interpretation,
+                       const enum cudaRoundMode rounding) {
+    unsigned char res; // encoded result: sign in bit 5, magnitude in bits 0..4
+    unsigned long long int xbits; // bit pattern of x, filled by memcpy below
+
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    unsigned char FP6_MAXNORM; // magnitude bits returned on overflow or NaN
+    unsigned char FP6_MANTISSA_MASK; // selects the stored mantissa bits
+    unsigned short int FP6_EXP_BIAS; // exponent bias of the target kind
+    unsigned long long int FP6_SIGNIFICAND_BITS; // stored mantissa bits plus the implicit leading bit
+    unsigned long long int FP6_MINDENORM_O2; // double bit pattern of half the smallest denormal
+    unsigned long long int FP6_OVERFLOW_THRESHOLD; // double bit pattern of the largest finite value
+    unsigned long long int FP6_MINNORM; // double bit pattern of the smallest normal value
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL; // double +infinity; a larger absx means NaN
+
+    switch (fp6_interpretation)
+    {
+        case __NV_E2M3:
+            FP6_EXP_BIAS = 1U;
+            FP6_SIGNIFICAND_BITS = 4ULL;
+            FP6_MANTISSA_MASK = 0x7U;
+            FP6_MINDENORM_O2 = 0x3FB0000000000000ULL; // mindenorm/2 = 2^-4
+            FP6_OVERFLOW_THRESHOLD =
+                0x401E000000000000ULL; // maxnorm = 7.5
+            FP6_MAXNORM = 0x1FU;
+            FP6_MINNORM = 0x3FF0000000000000ULL; // minnorm = 2^0
+            break;
+        case __NV_E3M2:
+        default: // any other interpretation value is handled as e3m2
+            FP6_EXP_BIAS = 3U;
+            FP6_SIGNIFICAND_BITS = 3ULL;
+            FP6_MANTISSA_MASK = 0x3U;
+            FP6_MINDENORM_O2 = 0x3FA0000000000000ULL; // mindenorm/2 = 2^-5
+            FP6_OVERFLOW_THRESHOLD =
+                0x403C000000000000ULL; // maxnorm = 28
+            FP6_MAXNORM = 0x1FU;
+            FP6_MINNORM = 0x3FD0000000000000ULL; // minnorm = 2^-2
+            break;
+    }
+
+    // 1/2 LSB of the target format, positioned in double precision mantissa
+    // helpful in midpoints detection during round-to-nearest-even step
+    const unsigned long long int FP6_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP6_SIGNIFICAND_BITS - 1ULL);
+    // prepare sign bit in target format
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 5U);
+    // re-bias the double exponent (bias 1023) for the target format; the value is truncated to 8 bits
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP6_EXP_BIAS);
+    // round mantissa to target format width, rounding towards zero
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP6_SIGNIFICAND_BITS)) &
+        FP6_MANTISSA_MASK;
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL; // x with the sign bit cleared; compared as an integer below
+
+    if (absx <= FP6_MINDENORM_O2) {
+        // zero or underflow: magnitude becomes 0, the sign bit is still OR-ed in at the end
+        res = 0U;
+    } else if (absx > FP6_OVERFLOW_THRESHOLD) {
+        // overflow or NaN
+        if (absx > DP_INF_BITS)
+        {
+            // NaN converts to positive FP6_MAXNORM
+            sign = 0U;
+        }
+        res = FP6_MAXNORM;
+    } else if (absx >= FP6_MINNORM) {
+        res = (unsigned char)((exp << (FP6_SIGNIFICAND_BITS - 1U)) | mantissa); // normal range: exponent field above the stored mantissa bits
+        // rounded-off bits
+        unsigned long long int round =
+            xbits & ((FP6_DP_HALF_ULP << 1ULL) - 1ULL);
+        if (rounding == cudaRoundNearest)
+        {
+            // round-to-nearest-even adjustment
+            if ((round > FP6_DP_HALF_ULP) ||
+                ((round == FP6_DP_HALF_ULP) && (mantissa & 1U))) {
+                res = (unsigned char)(res + 1U); // a mantissa carry increments the exponent field
+            }
+        } else {
+            assert(rounding == cudaRoundZero); // truncation already happened above; other modes are rejected
+        }
+    } else // Denormal range
+    {
+        unsigned char shift = (unsigned char)(1U - exp); // extra right shift for the denormal encoding (8-bit wrap-around arithmetic)
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP6_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+
+        if (rounding == cudaRoundNearest)
+        {
+            // rounded-off bits, including implicit leading bit
+            unsigned long long int round =
+                (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+                ((FP6_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+            // round-to-nearest-even adjustment
+            if ((round > (FP6_DP_HALF_ULP << shift)) ||
+                ((round == (FP6_DP_HALF_ULP << shift)) && (res & 1U))) {
+                res = (unsigned char)(res + 1U);
+            }
+        } else {
+            assert(rounding == cudaRoundZero); // truncation already happened above; other modes are rejected
+        }
+    }
+
+    res |= sign; // attach the sign bit (cleared earlier for NaN inputs)
+
+    return (__nv_fp6_storage_t)res;
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_double2_to_fp6x2(const double2 x,
+                          const __nv_fp6_interpretation_t fp6_interpretation,
+                          const enum cudaRoundMode rounding) {
+    __nv_fp6x2_storage_t storage = (__nv_fp6x2_storage_t)__nv_cvt_double_to_fp6(
+        x.y, fp6_interpretation, rounding); // convert x.y first
+    storage = (__nv_fp6x2_storage_t)(storage << 8U); // move the x.y result up to bits 8..15
+    storage = (__nv_fp6x2_storage_t)(storage |
+                                     __nv_cvt_double_to_fp6(
+                                         x.x, fp6_interpretation, rounding)); // the x.x result goes in the low bits
+    return storage;
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_float_to_fp6(const float x,
+                      const __nv_fp6_interpretation_t fp6_interpretation,
+                      const enum cudaRoundMode rounding) {
+    __nv_fp6_storage_t res = 0U;
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero)); // only these two rounding modes are accepted
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest) // inline-asm path, used for round-to-nearest only
+    {
+        __nv_fp6x2_storage_t storage; // the instruction produces a packed pair
+        if (fp6_interpretation == __NV_E3M2) {
+            asm("{cvt.rn.satfinite.e3m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f)); // x is paired with a dummy 0.0f operand
+        } else {
+            asm("{cvt.rn.satfinite.e2m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f)); // x is paired with a dummy 0.0f operand
+        }
+        res = (__nv_fp6_storage_t)storage; // cast to the single-value storage type (presumably keeps the low byte, where x's result sits)
+    } else
+#endif
+    {
+        res = __nv_cvt_double_to_fp6((double)x, fp6_interpretation, rounding); // generic path: widen to double and reuse the double converter
+    }
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_float2_to_fp6x2(const float2 x,
+                         const __nv_fp6_interpretation_t fp6_interpretation,
+                         const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero)); // only these two rounding modes are accepted
+    __nv_fp6x2_storage_t storage;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundNearest) { // inline-asm path, used for round-to-nearest only
+        if (fp6_interpretation == __NV_E3M2) {
+            asm("{cvt.rn.satfinite.e3m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y)); // operands are passed to the instruction as x.y, then x.x
+        } else {
+            asm("{cvt.rn.satfinite.e2m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y)); // operands are passed to the instruction as x.y, then x.x
+        }
+    } else
+#endif
+    {
+        storage = (__nv_fp6x2_storage_t)__nv_cvt_float_to_fp6(
+            x.y, fp6_interpretation, rounding); // generic path: convert x.y first
+        storage = (__nv_fp6x2_storage_t)(storage << 8U); // move the x.y result up to bits 8..15
+        storage = (__nv_fp6x2_storage_t)(storage | __nv_cvt_float_to_fp6(
+                                                       x.x,
+                                                       fp6_interpretation, rounding)); // the x.x result goes in the low bits
+    }
+    return storage;
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t
+__nv_cvt_halfraw_to_fp6(const __half_raw x,
+                        const __nv_fp6_interpretation_t fp6_interpretation,
+                        const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero)); // only these two rounding modes are accepted
+    __nv_fp6_storage_t res = 0U;
+    float fx = __internal_halfraw_to_float(x); // widen to float first (helper is defined outside this view)
+    res = __nv_cvt_float_to_fp6(fx, fp6_interpretation, rounding); // then reuse the float converter
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t __nv_cvt_halfraw2_to_fp6x2(
+    const __half2_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding) {
+    assert((rounding == cudaRoundNearest) || (rounding == cudaRoundZero));
+    __half_raw lane;
+    __nv_fp6x2_storage_t packed;
+    /* The x.x element is converted first; it ends up in the low bits. */
+    lane.x = x.x;
+    const __nv_fp6_storage_t low_elem =
+        __nv_cvt_halfraw_to_fp6(lane, fp6_interpretation, rounding);
+    /* The x.y element is converted second and shifted left by 8. */
+    lane.x = x.y;
+    const __nv_fp6_storage_t high_elem =
+        __nv_cvt_halfraw_to_fp6(lane, fp6_interpretation, rounding);
+    packed = (__nv_fp6x2_storage_t)((__nv_fp6x2_storage_t)high_elem << 8U);
+    return (__nv_fp6x2_storage_t)(packed | low_elem);
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6_storage_t __nv_cvt_bfloat16raw_to_fp6(
+    const __nv_bfloat16_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding) {
+    /* bfloat16 is widened to float first; the float -> fp6 conversion then
+     * receives the requested format and rounding mode unchanged. */
+    const float as_float = __internal_bf16raw_to_float(x);
+    return __nv_cvt_float_to_fp6(as_float, fp6_interpretation, rounding);
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __nv_fp6x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp6x2(
+    const __nv_bfloat162_raw x,
+    const __nv_fp6_interpretation_t fp6_interpretation,
+    const enum cudaRoundMode rounding) {
+    __nv_bfloat16_raw lane;
+    /* x.y is converted first and shifted up by 8; x.x is OR-ed in below it. */
+    lane.x = x.y;
+    const __nv_fp6_storage_t upper =
+        __nv_cvt_bfloat16raw_to_fp6(lane, fp6_interpretation, rounding);
+    lane.x = x.x;
+    const __nv_fp6_storage_t lower =
+        __nv_cvt_bfloat16raw_to_fp6(lane, fp6_interpretation, rounding);
+    const __nv_fp6x2_storage_t shifted =
+        (__nv_fp6x2_storage_t)((__nv_fp6x2_storage_t)upper << 8U);
+    return (__nv_fp6x2_storage_t)(shifted | lower);
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __half2_raw
+__nv_cvt_fp6x2_to_halfraw2(const __nv_fp6x2_storage_t x,
+                           const __nv_fp6_interpretation_t fp6_interpretation); /* declared early: the device path below calls it */
+__CUDA_HOSTDEVICE_FP6_DECL__ __half_raw
+__nv_cvt_fp6_to_halfraw(const __nv_fp6_storage_t x,
+                        const __nv_fp6_interpretation_t fp6_interpretation) {
+    __half_raw res; /* raw half bit pattern built from the fp6 input */
+    res.x = 0U;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    res.x = /* widen x to a pair, convert with the x2 routine, keep element .x */
+        __nv_cvt_fp6x2_to_halfraw2((__nv_fp6x2_storage_t)x, fp6_interpretation)
+            .x;
+#else
+    {
+        unsigned short int ur = (unsigned short int)x;
+        ur = (unsigned short int)(ur << 10U); // input bits 5..0 now occupy bits 15..10
+
+        unsigned short int sign = ur & 0x8000U; // input bit 5, now at bit 15
+        unsigned short int exponent;
+        unsigned short int bias_difference;
+        unsigned short int mantissa;
+
+        if (fp6_interpretation == __NV_E3M2) {
+            bias_difference = (unsigned short int)(15 - 3) << 10U; // (15 - 3) aligned to bit 10, where the exponent is assembled
+            exponent = (unsigned short int)(((ur & 0x7000U) >> 2U) + bias_difference); // 3 exponent bits: 14..12 -> 12..10
+            mantissa = (ur & 0x0C00U) >> 2U; // 2 mantissa bits: 11..10 -> 9..8
+        } else {
+            // __NV_E2M3 layout: 2 exponent bits followed by 3 mantissa bits
+            bias_difference = (unsigned short int)(15 - 1) << 10U; // (15 - 1) aligned to bit 10
+            exponent = (unsigned short int)(((ur & 0x6000U) >> 3U) + bias_difference); // 2 exponent bits: 14..13 -> 11..10
+            mantissa = (ur & 0x1C00U) >> 3U; // 3 mantissa bits: 12..10 -> 9..7
+        }
+
+        if (exponent == bias_difference) {
+            // the fp6 exponent field was all zeros: input is zero or denormal
+            if (mantissa != 0U) {
+                // normalize: shift until the leading one reaches bit 10, lowering the exponent by one step (0x0400) per extra shift
+                mantissa = (unsigned short int)(mantissa << 1U);
+                while ((mantissa & 0x0400U) == 0U) {
+                    mantissa = (unsigned short int)(mantissa << 1U);
+                    exponent = (unsigned short int)(exponent - 0x0400U);
+                }
+                // drop the leading one at bit 10; only the low 10 bits are kept
+                mantissa &= 0x03FFU;
+            } else { // zero: clear the exponent so only the sign bit survives
+                exponent = 0U;
+            }
+        }
+
+        res.x = (sign | exponent) | mantissa;
+    }
+#endif
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP6_DECL__ __half2_raw
+__nv_cvt_fp6x2_to_halfraw2(const __nv_fp6x2_storage_t x,
+                           const __nv_fp6_interpretation_t fp6_interpretation) {
+    __half2_raw widened;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned int bits;
+    if (fp6_interpretation != __NV_E3M2) {
+        asm("{cvt.rn.f16x2.e2m3x2 %0, %1;}\n" : "=r"(bits) : "h"(x));
+    } else {
+        asm("{cvt.rn.f16x2.e3m2x2 %0, %1;}\n" : "=r"(bits) : "h"(x));
+    }
+    (void)memcpy(&widened, &bits, sizeof(bits));
+#else
+    /* Portable path: x feeds .x and (x >> 8) feeds .y, each narrowed to one fp6. */
+    const __nv_fp6_storage_t lo = (__nv_fp6_storage_t)x;
+    const __nv_fp6_storage_t hi = (__nv_fp6_storage_t)(x >> 8U);
+    widened.x = __nv_cvt_fp6_to_halfraw(lo, fp6_interpretation).x;
+    widened.y = __nv_cvt_fp6_to_halfraw(hi, fp6_interpretation).x;
+#endif
+    return widened;
+}
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \defgroup CUDA_MATH_FP6_E3M2_STRUCT C++ struct for handling fp6 data type of e3m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6_E3M2_STRUCT
+ * \brief __nv_fp6_e3m2 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp6 floating-point numbers of \p e3m2 kind:
+ * with 1 sign, 3 exponent, 1 implicit and 2 explicit mantissa bits.
+ * This encoding does not support Inf/NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp6_e3m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6_E3M2_STRUCT
+     * Storage variable holding the raw \p fp6 bit pattern of this value.
+     */
+    __nv_fp6_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Default constructor; it performs no assignment to \p __x.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6_e3m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note: constructor init-lists are deliberately avoided because of the
+     * special host/device compilation rules; members are assigned in the body */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp6(static_cast<__half_raw>(f),
+                                      __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp6(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const float f) {
+        __x = __nv_cvt_float_to_fp6(f, __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const double f) {
+        __x = __nv_cvt_double_to_fp6(f, __NV_E3M2, cudaRoundNearest);
+    }
+
+    /* Converts from integral: every overload below casts to float and reuses the float constructor */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e3m2(const unsigned short int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const unsigned int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const unsigned long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e3m2(const unsigned long long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p short \p int data type; converts through \p float like the other integral constructors.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const short int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e3m2(const long long int val) {
+        __x = static_cast<__nv_fp6_e3m2>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp6_to_halfraw(__x, __NV_E3M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p float data type (fp6 -> raw half -> float).
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp6_to_halfraw(__x, __NV_E3M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __nv_bfloat16 data type (via \p float).
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p double data type (via \p float).
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+
+        if (f < 0.0f) {
+            // negative input: clamp to zero
+            i = 0U;
+        } else {
+            // non-negative input: plain float-to-integer cast
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long)) /* pick the helper matching the width of unsigned long */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p signed \p char data type; casts the \p float value directly.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator signed char() const {
+        const float f = float(*this);
+        return static_cast<signed char>(f);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     *
+     * Detects signedness of the \p char type and forwards to the matching
+     * signed or unsigned char operator; see those operators for details.
+     *
+     * Clamps inputs to the output range.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0) /* true when plain char is a signed type */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p short \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long)) /* pick the helper matching the width of long */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator bool() const {
+        return (__x & 0x1FU) != 0U; // tests only the low five bits, so the sign bit (bit 5) is ignored
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X2_E3M2_STRUCT C++ struct for handling vector type of two fp6 values of e3m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X2_E3M2_STRUCT
+ * \brief __nv_fp6x2_e3m2 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp6 floating-point numbers of \p e3m2 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp6x2_e3m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X2_E3M2_STRUCT
+     * Raw storage for the packed pair of \p fp6 values handled by this
+     * vector type.
+     */
+    __nv_fp6x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Default constructor; it performs no assignment to \p __x.
+     */
+#if !defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2() {}
+#else
+    __nv_fp6x2_e3m2() = default;
+#endif /* !defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Converting constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from \p __half2. Uses \p cudaRoundNearest;
+     * out-of-range values follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const __half2 f) {
+        const __half2_raw raw_pair = static_cast<__half2_raw>(f);
+        this->__x = __nv_cvt_halfraw2_to_fp6x2(raw_pair, __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from \p __nv_bfloat162. Uses \p cudaRoundNearest;
+     * out-of-range values follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair = static_cast<__nv_bfloat162_raw>(f);
+        this->__x = __nv_cvt_bfloat16raw2_to_fp6x2(raw_pair, __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from \p float2. Uses \p cudaRoundNearest;
+     * out-of-range values follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const float2 f) {
+        this->__x = __nv_cvt_float2_to_fp6x2(f, __NV_E3M2, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from \p double2. Uses \p cudaRoundNearest;
+     * out-of-range values follow \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e3m2(const double2 f) {
+        this->__x = __nv_cvt_double2_to_fp6x2(f, __NV_E3M2, cudaRoundNearest);
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Conversions back to wider vector types */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Explicit conversion of both elements to \p __half2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half2() const {
+        return static_cast<__half2>(__nv_cvt_fp6x2_to_halfraw2(this->__x, __NV_E3M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Explicit conversion of both elements to \p float2 (via raw half2).
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float2() const {
+        const __half2_raw halves = __nv_cvt_fp6x2_to_halfraw2(this->__x, __NV_E3M2);
+        return __internal_halfraw2_to_float2(halves);
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X4_E3M2_STRUCT C++ struct for handling vector type of four fp6 values of e3m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X4_E3M2_STRUCT
+ * \brief __nv_fp6x4_e3m2 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp6 floating-point numbers of \p e3m2 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp6x4_e3m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X4_E3M2_STRUCT
+     * Raw storage for four \p fp6 values; the \p float4 operator reads it
+     * as two fp6x2 pairs (low 16 bits first, then the bits above them).
+     */
+    __nv_fp6x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Default constructor; it performs no assignment to \p __x.
+     */
+#if !defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2() {}
+#else
+    __nv_fp6x4_e3m2() = default;
+#endif /* !defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Converting constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from two \p __half2 values (\p flo, then \p fhi).
+     * Uses \p cudaRoundNearest; out-of-range values follow \p __NV_SATFINITE.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __half2_raw lo_raw = static_cast<__half2_raw>(flo);
+        const __half2_raw hi_raw = static_cast<__half2_raw>(fhi);
+        this->__x = __internal_pack_u16x2_to_u32(
+            __nv_cvt_halfraw2_to_fp6x2(lo_raw, __NV_E3M2, cudaRoundNearest),
+            __nv_cvt_halfraw2_to_fp6x2(hi_raw, __NV_E3M2, cudaRoundNearest));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from two \p __nv_bfloat162 values (\p flo, then \p fhi).
+     * Uses \p cudaRoundNearest; out-of-range values follow \p __NV_SATFINITE.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_bfloat162_raw lo_raw = static_cast<__nv_bfloat162_raw>(flo);
+        const __nv_bfloat162_raw hi_raw = static_cast<__nv_bfloat162_raw>(fhi);
+        this->__x = __internal_pack_u16x2_to_u32(
+            __nv_cvt_bfloat16raw2_to_fp6x2(lo_raw, __NV_E3M2, cudaRoundNearest),
+            __nv_cvt_bfloat16raw2_to_fp6x2(hi_raw, __NV_E3M2, cudaRoundNearest));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from a \p float4 vector.
+     * Uses \p cudaRoundNearest; out-of-range values follow \p __NV_SATFINITE.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const float4 f) {
+        // Split into (x, y) and (z, w), convert each pair, then hand both
+        // results to the pack helper with the (x, y) pair first.
+        const float2 xy = {f.x, f.y};
+        const float2 zw = {f.z, f.w};
+        this->__x = __internal_pack_u16x2_to_u32(
+            __nv_cvt_float2_to_fp6x2(xy, __NV_E3M2, cudaRoundNearest),
+            __nv_cvt_float2_to_fp6x2(zw, __NV_E3M2, cudaRoundNearest));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Converting constructor from a \p double4 vector.
+     * Uses \p cudaRoundNearest; out-of-range values follow \p __NV_SATFINITE.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e3m2(const double4 f) {
+        // Split into (x, y) and (z, w), convert each pair, then hand both
+        // results to the pack helper with the (x, y) pair first.
+        const double2 xy = {f.x, f.y};
+        const double2 zw = {f.z, f.w};
+        this->__x = __internal_pack_u16x2_to_u32(
+            __nv_cvt_double2_to_fp6x2(xy, __NV_E3M2, cudaRoundNearest),
+            __nv_cvt_double2_to_fp6x2(zw, __NV_E3M2, cudaRoundNearest));
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Conversions back to wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Explicit conversion of all four elements to a \p float4 vector.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float4() const {
+        // Low 16 bits of the storage give (x, y); the bits above give (z, w).
+        const __nv_fp6x2_storage_t low_pair = static_cast<__nv_fp6x2_storage_t>(this->__x);
+        const __nv_fp6x2_storage_t high_pair = static_cast<__nv_fp6x2_storage_t>(this->__x >> 16U);
+        const __half2_raw low_h2 = __nv_cvt_fp6x2_to_halfraw2(low_pair, __NV_E3M2);
+        const __half2_raw high_h2 = __nv_cvt_fp6x2_to_halfraw2(high_pair, __NV_E3M2);
+        const float2 xy = __internal_halfraw2_to_float2(low_h2);
+        const float2 zw = __internal_halfraw2_to_float2(high_h2);
+        float4 out = {xy.x, xy.y, zw.x, zw.y};
+        return out;
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6_E2M3_STRUCT C++ struct for handling fp6 data type of e2m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6_E2M3_STRUCT
+ * \brief __nv_fp6_e2m3 datatype
+ *
+ * \details This structure implements the datatype for storing
+ * \p fp6 floating-point numbers of \p e2m3 kind:
+ * with 1 sign, 2 exponent, 1 implicit and 3 explicit mantissa bits.
+ * This encoding does not support Inf/NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp6_e2m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6_E2M3_STRUCT
+     * Storage variable contains the \p fp6 floating-point data.
+     */
+    __nv_fp6_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6_e2m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp6(static_cast<__half_raw>(f),
+                                      __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp6(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const float f) {
+        __x = __nv_cvt_float_to_fp6(f, __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values and \p cudaRoundNearest rounding mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const double f) {
+        __x = __nv_cvt_double_to_fp6(f, __NV_E2M3, cudaRoundNearest);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e2m3(const unsigned short int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const unsigned int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const unsigned long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__
+    __nv_fp6_e2m3(const unsigned long long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p short \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const short int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6_e2m3(const long long int val) {
+        __x = static_cast<__nv_fp6_e2m3>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp6_to_halfraw(__x, __NV_E2M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p float data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp6_to_halfraw(__x, __NV_E2M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p double data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned char() const {
+        // Work on the float image of the stored value.
+        const float asFloat = static_cast<float>(*this);
+
+        // Anything below zero is pinned to the smallest unsigned value.
+        if (asFloat < 0.0f) {
+            return static_cast<unsigned char>(0U);
+        }
+
+        // Non-negative inputs use the built-in float -> unsigned char
+        // conversion.
+        return static_cast<unsigned char>(asFloat);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned short int() const {
+        return __half2ushort_rz(static_cast<__half>(*this)); // via operator __half
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned int() const {
+        return __half2uint_rz(static_cast<__half>(*this)); // via operator __half
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long int() const {
+        // unsigned long may or may not match unsigned long long in size;
+        // choose the conversion helper accordingly.
+        const __half asHalf = static_cast<__half>(*this);
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long long) != sizeof(unsigned long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // sizes differ: go through the unsigned int helper
+            return static_cast<unsigned long>(__half2uint_rz(asHalf));
+        }
+        // same size: use the unsigned long long helper
+        return static_cast<unsigned long>(__half2ull_rz(asHalf));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator unsigned long long int() const {
+        return __half2ull_rz(static_cast<__half>(*this)); // via operator __half
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p signed \p char data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator signed char() const {
+        // Built-in float -> signed char conversion of the float image; no branch.
+        return static_cast<signed char>(static_cast<float>(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     * 
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     * 
+     * Clamps inputs to the output range.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator char() const {
+        // Plain char may be signed or unsigned; probe that with a constant
+        // comparison and forward to the matching signed/unsigned char operator.
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (static_cast<char>(-1) < static_cast<char>(0))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // char behaves as a signed type
+            return static_cast<char>(static_cast<signed char>(*this));
+        }
+
+        // char behaves as an unsigned type
+        return static_cast<char>(static_cast<unsigned char>(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p short \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator short int() const {
+        return __half2short_rz(static_cast<__half>(*this)); // via operator __half
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator int() const {
+        return __half2int_rz(static_cast<__half>(*this)); // via operator __half
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long int() const {
+        // long may or may not match long long in size; choose the conversion
+        // helper accordingly.
+        const __half asHalf = static_cast<__half>(*this);
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long long) != sizeof(long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // sizes differ: go through the int helper
+            return static_cast<long>(__half2int_rz(asHalf));
+        }
+        // same size: use the long long helper
+        return static_cast<long>(__half2ll_rz(asHalf));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator long long int() const {
+        return __half2ll_rz(static_cast<__half>(*this)); // via operator __half
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator bool() const {
+        return 0U != (__x & 0x1FU); // true when any of the low five bits is set
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X2_E2M3_STRUCT C++ struct for handling vector type of two fp6 values of e2m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X2_E2M3_STRUCT
+ * \brief __nv_fp6x2_e2m3 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp6 floating-point numbers of \p e2m3 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp6x2_e2m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X2_E2M3_STRUCT
+     * Raw storage: both \p fp6 values of the pair live in this single
+     * member.
+     */
+    __nv_fp6x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Default constructor; it assigns nothing to the storage member.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6x2_e2m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Converting constructors (wider pair types -> fp6x2) */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p __half2 value, passing \p cudaRoundNearest;
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const __half2 f) {
+        const __half2_raw raw = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp6x2(raw, __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p __nv_bfloat162 value, passing
+     * \p cudaRoundNearest; relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp6x2(raw, __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p float2 value, passing \p cudaRoundNearest;
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const float2 f) {
+        this->__x = __nv_cvt_float2_to_fp6x2(f, __NV_E2M3, cudaRoundNearest);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the pair from a \p double2 value, passing \p cudaRoundNearest;
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x2_e2m3(const double2 f) {
+        this->__x = __nv_cvt_double2_to_fp6x2(f, __NV_E2M3, cudaRoundNearest);
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Conversion operators (fp6x2 -> wider pair types) */
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator yielding the stored pair as a \p __half2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator __half2() const {
+        return __half2(__nv_cvt_fp6x2_to_halfraw2(__x, __NV_E2M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator yielding the stored pair as a \p float2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float2() const {
+        // fp6x2 storage -> raw half2 -> float2
+        return __internal_halfraw2_to_float2(__nv_cvt_fp6x2_to_halfraw2(__x, __NV_E2M3));
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP6X4_E2M3_STRUCT C++ struct for handling vector type of four fp6 values of e2m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP6
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP6X4_E2M3_STRUCT
+ * \brief __nv_fp6x4_e2m3 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp6 floating-point numbers of \p e2m3 kind each.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp6x4_e2m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP6X4_E2M3_STRUCT
+     * Raw storage: all four \p fp6 values of the vector live in this single
+     * member.
+     */
+    __nv_fp6x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Default constructor; it assigns nothing to the storage member.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP6)
+    __nv_fp6x4_e2m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP6) */
+
+#if !defined(__CUDA_NO_FP6_CONVERSIONS__)
+
+    /* Converting constructors (wider vector types -> fp6x4) */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the vector from two \p __half2 values (\p flo, then \p fhi);
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp6x2_storage_t packedLo = __nv_cvt_halfraw2_to_fp6x2(
+            static_cast<__half2_raw>(flo), __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t packedHi = __nv_cvt_halfraw2_to_fp6x2(
+            static_cast<__half2_raw>(fhi), __NV_E2M3, cudaRoundNearest);
+        this->__x = __internal_pack_u16x2_to_u32(packedLo, packedHi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the vector from two \p __nv_bfloat162 values (\p flo, then
+     * \p fhi); relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp6x2_storage_t packedLo = __nv_cvt_bfloat16raw2_to_fp6x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t packedHi = __nv_cvt_bfloat16raw2_to_fp6x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_E2M3, cudaRoundNearest);
+        this->__x = __internal_pack_u16x2_to_u32(packedLo, packedHi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the vector from a \p float4: (x, y) and (z, w) are converted as
+     * two pairs; relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const float4 f) {
+        const float2 xy = {f.x, f.y};
+        const float2 zw = {f.z, f.w};
+        const __nv_fp6x2_storage_t packedXY =
+            __nv_cvt_float2_to_fp6x2(xy, __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t packedZW =
+            __nv_cvt_float2_to_fp6x2(zw, __NV_E2M3, cudaRoundNearest);
+        this->__x = __internal_pack_u16x2_to_u32(packedXY, packedZW);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Builds the vector from a \p double4: (x, y) and (z, w) are converted as
+     * two pairs; relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ __nv_fp6x4_e2m3(const double4 f) {
+        const double2 xy = {f.x, f.y};
+        const double2 zw = {f.z, f.w};
+        const __nv_fp6x2_storage_t packedXY =
+            __nv_cvt_double2_to_fp6x2(xy, __NV_E2M3, cudaRoundNearest);
+        const __nv_fp6x2_storage_t packedZW =
+            __nv_cvt_double2_to_fp6x2(zw, __NV_E2M3, cudaRoundNearest);
+        this->__x = __internal_pack_u16x2_to_u32(packedXY, packedZW);
+    }
+
+#if !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__)
+    /* Conversion operators (fp6x4 -> wider vector types) */
+
+    /**
+     * \ingroup CUDA_MATH_FP6_MISC
+     * Conversion operator yielding the four stored values as a \p float4.
+     */
+    explicit __CUDA_HOSTDEVICE_FP6__ operator float4() const {
+        // x and y are decoded from __x narrowed to the fp6x2 storage type.
+        const float2 xy = __internal_halfraw2_to_float2(__nv_cvt_fp6x2_to_halfraw2(
+            static_cast<__nv_fp6x2_storage_t>(__x), __NV_E2M3));
+        // z and w are decoded from __x shifted right by 16 bits.
+        const float2 zw = __internal_halfraw2_to_float2(__nv_cvt_fp6x2_to_halfraw2(
+            static_cast<__nv_fp6x2_storage_t>(__x >> 16U), __NV_E2M3));
+
+        const float4 unpacked = {xy.x, xy.y, zw.x, zw.y};
+        return unpacked;
+    }
+#endif /* !defined(__CUDA_NO_FP6_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP6_CONVERSIONS__) */
+};
+
+#endif /* defined(__cplusplus) */
+
+#endif /* end of include guard: __CUDA_FP6_HPP__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..05d673704c5309d45361855e8425ab18566d38c5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.hpp
@@ -0,0 +1,2728 @@
+/*
+ * Copyright 2022-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_FP8_HPP__)
+#define __CUDA_FP8_HPP__
+
+#if !defined(__CUDA_FP8_H__)
+#error "Do not include this file directly. Instead, include cuda_fp8.h."
+#endif
+
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+#include <nv/target>
+#endif  /* !defined(__CUDACC_RTC__) */
+
+#if !defined(IF_DEVICE_OR_CUDACC)
+#if defined(__CUDACC__)
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, c)
+#else
+    #define IF_DEVICE_OR_CUDACC(d, c, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, d, f)
+#endif
+#endif
+
+/*
+ * Bring in the standard assertions header to enforce the subset
+ * of rounding modes supported by the APIs defined here.
+ * NOTE: NVRTC defines its own assert
+ */
+#if !defined (__CUDACC_RTC__)
+#include <assert.h>
+#endif
+
+/* Set up structure-alignment attribute */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !defined(__CPP_VERSION_AT_LEAST_11_FP8)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP8)
+/* need c++11 for explicit operators */
+#define __CUDA_NO_FP8_CONVERSION_OPERATORS__
+#endif
+
+#if !(defined __DOXYGEN_ONLY__)
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
+                       const __nv_fp8_interpretation_t fp8_interpretation) {
+    unsigned char res; // fp8 magnitude bits; the sign is OR-ed in at the end
+    unsigned long long int xbits; // raw 64-bit image of x (filled by memcpy)
+    // Software conversion: operates on the bit pattern of x with integer ops.
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    unsigned char FP8_MAXNORM; // magnitude returned when saturating
+    unsigned char FP8_MANTISSA_MASK; // keeps the stored mantissa bits
+    unsigned short int FP8_EXP_BIAS; // exponent bias of the target format
+    unsigned long long int FP8_SIGNIFICAND_BITS; // stored mantissa bits + 1
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
+    unsigned long long int FP8_MINDENORM_O2; // |x| <= this gives zero
+    unsigned long long int FP8_OVERFLOW_THRESHOLD; // |x| > this overflows
+    unsigned long long int FP8_MINNORM; // |x| >= this takes the normal path
+
+    if (fp8_interpretation == __NV_E4M3) {
+        FP8_EXP_BIAS = 7U;
+        FP8_SIGNIFICAND_BITS = 4ULL;
+        FP8_MANTISSA_MASK = 0x7U;
+        FP8_MINDENORM_O2 = 0x3F50000000000000ULL; // mindenorm/2 = 2^-10
+        FP8_OVERFLOW_THRESHOLD =
+            0x407D000000000000ULL; // maxnorm + 1/2ulp = 0x1.Cp+8 + 0x1p+4
+        FP8_MAXNORM = 0x7EU;
+        FP8_MINNORM = 0x3F90000000000000ULL; // minnorm = 2^-6
+    } else {                                 //__NV_E5M2
+        FP8_EXP_BIAS = 15U;
+        FP8_SIGNIFICAND_BITS = 3ULL;
+        FP8_MANTISSA_MASK = 0x3U;
+        FP8_MINDENORM_O2 = 0x3EE0000000000000ULL; // mindenorm/2 = 2^-17
+        FP8_OVERFLOW_THRESHOLD =
+            0x40EE000000000000ULL -
+            1ULL; // maxnorm + 1/2ulp = 0x1.Ep+15, and -1 to have common code
+        FP8_MAXNORM = 0x7BU;
+        FP8_MINNORM = 0x3F10000000000000ULL; // minnorm = 2^-14
+    }
+
+    // 1/2 LSB of the target format, positioned in the double precision
+    // mantissa; used to detect midpoints in the round-to-nearest-even steps
+    const unsigned long long int FP8_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP8_SIGNIFICAND_BITS - 1ULL);
+    // sign: bit 63 of the double moved to bit 7 of the fp8 byte
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 7U);
+    // exponent: remove the double bias (1023), apply the target bias,
+    // keep the low 8 bits; only consumed by the normal/denormal branches
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP8_EXP_BIAS);
+    // mantissa: top bits of the double mantissa, truncated (toward zero)
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP8_SIGNIFICAND_BITS)) &
+        FP8_MANTISSA_MASK;
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL; // sign cleared
+
+    if (absx <= FP8_MINDENORM_O2) {
+        // zero, or at most half the smallest denormal: magnitude becomes 0
+        res = 0U;
+    } else if (absx > DP_INF_BITS) {
+        // NaN (bit pattern above the infinity encoding)
+        if (fp8_interpretation == __NV_E4M3) {
+            res = 0x7FU;
+        } else {
+            // NaN --> QNaN, keeping the truncated mantissa bits
+            res = 0x7EU | mantissa;
+        }
+    } else if (absx > FP8_OVERFLOW_THRESHOLD) { // also reached by +/-Inf inputs
+        if (saturate == __NV_SATFINITE) {
+            res = FP8_MAXNORM;
+        } else {
+            // __NV_NOSAT
+            if (fp8_interpretation == __NV_E4M3) {
+                // no Inf in E4M3
+                res = 0x7FU; // NaN
+            } else {
+                res = 0x7CU; // Inf in E5M2
+            }
+        }
+    } else if (absx >= FP8_MINNORM) {
+        res = (unsigned char)((exp << (FP8_SIGNIFICAND_BITS - 1U)) | mantissa);
+        // bits of the double mantissa dropped by the truncation above
+        unsigned long long int round =
+            xbits & ((FP8_DP_HALF_ULP << 1ULL) - 1ULL);
+        // round-to-nearest-even: above the midpoint, or a tie with odd LSB
+        if ((round > FP8_DP_HALF_ULP) ||
+            ((round == FP8_DP_HALF_ULP) && (mantissa & 1U))) {
+            res = (unsigned char)(res + 1U); // carry may reach the exponent
+        }
+    } else // Denormal range
+    {
+        unsigned char shift = (unsigned char)(1U - exp); // extra right shift
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP8_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+
+        // rounded-off bits, including implicit leading bit
+        unsigned long long int round =
+            (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+            ((FP8_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+        // round-to-nearest-even, with the midpoint moved up by `shift` bits
+        if ((round > (FP8_DP_HALF_ULP << shift)) ||
+            ((round == (FP8_DP_HALF_ULP << shift)) && (res & 1U))) {
+            res = (unsigned char)(res + 1U);
+        }
+    }
+
+    res |= sign; // attach the sign bit to the magnitude chosen above
+
+    return (__nv_fp8_storage_t)res;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
+                          const __nv_fp8_interpretation_t fp8_interpretation) {
+    // x.y is converted first and shifted left by 8; x.x fills the low bits.
+    const __nv_fp8_storage_t upper =
+        __nv_cvt_double_to_fp8(x.y, saturate, fp8_interpretation);
+    const __nv_fp8_storage_t lower =
+        __nv_cvt_double_to_fp8(x.x, saturate, fp8_interpretation);
+    return (__nv_fp8x2_storage_t)(
+        (__nv_fp8x2_storage_t)((__nv_fp8x2_storage_t)upper << 8U) | lower);
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
+                      const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8_storage_t res = 0U; // placeholder; every path below overwrites it
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) { // device path, saturating mode only
+        __nv_fp8x2_storage_t storage; // the x2 cvt writes a packed pair
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        }
+        res = (__nv_fp8_storage_t)storage; // narrowing cast keeps the low part
+    } else
+#endif
+    { // generic path: canonicalize NaN, widen to double, convert in software
+        unsigned int xbits; // raw bit image of x
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&xbits, &x, sizeof(x));
+#else
+        (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+
+        // isnan: magnitude bits above the float infinity pattern
+        if ((xbits & 0x7FFFFFFFU) > 0x7F800000U) {
+            // Canonical NaN (sign bit cleared, all other bits set)
+            xbits = 0x7FFFFFFFU;
+        }
+
+        float fx; // x again, with any NaN replaced by the canonical one
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&fx, &xbits, sizeof(xbits));
+#else
+        (void)std::memcpy(&fx, &xbits, sizeof(xbits));
+#endif
+
+        const double dx = (double)fx; // rounding is done by the double routine
+        res = __nv_cvt_double_to_fp8(dx, saturate, fp8_interpretation);
+    }
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
+                         const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8x2_storage_t storage; // packed result, assigned on every path
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) { // device path: one cvt for both values
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        }
+    } else
+#endif
+    { // generic path: two scalar conversions combined by shift and OR
+        storage = (__nv_fp8x2_storage_t)__nv_cvt_float_to_fp8(
+            x.y, saturate, fp8_interpretation);
+        storage = (__nv_fp8x2_storage_t)(storage << 8U); // x.y moves up 8 bits
+        storage = (__nv_fp8x2_storage_t)(storage | __nv_cvt_float_to_fp8(
+                                                       x.x, saturate,
+                                                       fp8_interpretation));
+    }
+    return storage;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_halfraw_to_float(const __half_raw x) {
+    float f; // result; set by the cvt instruction or by memcpy below
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(f) : "h"(x.x));
+#else
+    const unsigned int ux = (unsigned int)x.x; // 16 half bits, zero-extended
+    unsigned int sign = (ux >> 15U) & 1U; // bit 15
+    unsigned int exponent = (ux >> 10U) & 0x1fU; // bits 14..10
+    unsigned int mantissa = (ux & 0x3ffU) << 13U; // bits 9..0, moved up 13
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN (sign is 0 or 1, so the shift clears it) */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); /* NaN: all ones */
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb; /* bit 22 of mantissa before each shift */
+            exponent = 0x71U; /* start value, decremented once per shift */
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U; /* rebias: add 112 to the half exponent field */
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+#endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */
+    return f;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ float2
+__internal_halfraw2_to_float2(const __half2_raw x) {
+    __half_raw lane; // scratch value, reloaded for each 16-bit element
+    float2 widened;
+    lane.x = x.x;
+    widened.x = __internal_halfraw_to_float(lane);
+    lane.x = x.y;
+    widened.y = __internal_halfraw_to_float(lane);
+    return widened;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8_storage_t res = 0U; // placeholder; every path below overwrites it
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) { // device path, saturating mode only
+        unsigned int half2_storage = (unsigned int)(x.x); // upper 16 bits zero
+        __nv_fp8x2_storage_t tmp; // packed pair written by the x2 cvt
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        }
+        res = (__nv_fp8_storage_t)tmp; // narrowing cast keeps the low part
+    } else
+#endif
+    { // generic path: widen to float and reuse the float converter
+        float fx = __internal_halfraw_to_float(x);
+        res = __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
+    }
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
+    const __half2_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8x2_storage_t tmp; // packed result, assigned on every path
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    if (saturate == __NV_SATFINITE) { // device path: one cvt for both halves
+        unsigned int half2_storage; // bit copy of x used as the "r" operand
+        (void)memcpy(&half2_storage, &x, sizeof(x));
+
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        }
+    } else
+#endif
+    { // generic path: convert x.x and x.y separately, then combine
+        __half_raw raw; // scratch value reused for both elements
+        raw.x = x.x;
+        __nv_fp8_storage_t lo =
+            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
+        raw.x = x.y;
+        __nv_fp8_storage_t hi =
+            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
+        tmp = hi;
+        tmp = (__nv_fp8x2_storage_t)(tmp << 8U); // x.y result moves up 8 bits
+        tmp = (__nv_fp8x2_storage_t)(tmp | lo); // x.x result fills the low bits
+    }
+    return tmp;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_bf16raw_to_float(const __nv_bfloat16_raw x) {
+    const unsigned int widened = ((unsigned int)x.x) << 16U; // bf16 -> top bits
+    float result; // receives the bit pattern built above
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&result, &widened, sizeof(widened));
+#else
+    (void)std::memcpy(&result, &widened, sizeof(widened));
+#endif
+    return result;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
+    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    // Widen the bfloat16 bits to float, then reuse the float converter with
+    // the caller's saturation and interpretation settings.
+    return __nv_cvt_float_to_fp8(__internal_bf16raw_to_float(x), saturate,
+                                 fp8_interpretation);
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp8x2(
+    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    // The .y element is converted first and shifted left by 8; the .x
+    // element then fills the low bits.
+    __nv_bfloat16_raw lane;
+    lane.x = x.y;
+    const __nv_fp8_storage_t upper =
+        __nv_cvt_bfloat16raw_to_fp8(lane, saturate, fp8_interpretation);
+    lane.x = x.x;
+    const __nv_fp8_storage_t lower =
+        __nv_cvt_bfloat16raw_to_fp8(lane, saturate, fp8_interpretation);
+    return (__nv_fp8x2_storage_t)(
+        (__nv_fp8x2_storage_t)((__nv_fp8x2_storage_t)upper << 8U) | lower);
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw // declared early: used just below
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation);
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
+__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    __half_raw res;
+    res.x = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    res.x =
+        __nv_cvt_fp8x2_to_halfraw2((__nv_fp8x2_storage_t)x, fp8_interpretation)
+            .x;
+#else
+    unsigned short int ur = (unsigned short int)x;
+    ur = (unsigned short int)(ur << 8U);
+
+    if (fp8_interpretation == __NV_E5M2) {
+        if ((ur & 0x7FFFU) > 0x7C00U) {
+            /* If NaN, return canonical NaN */
+            ur = 0x7FFFU;
+        }
+    } else { // __NV_E4M3
+        unsigned short int sign = ur & 0x8000U;
+        unsigned short int exponent =
+            (unsigned short int)(((ur & 0x7800U) >> 1U) + 0x2000U);
+        unsigned short int mantissa = (ur & 0x0700U) >> 1U;
+        unsigned char absx = 0x7FU & (unsigned char)x;
+
+        if (absx == 0x7FU) // NaN
+        {
+            ur = 0x7FFFU; // fp16 canonical NaN, discard sign
+        } else if (exponent == 0x2000U) {
+            // zero or denormal
+            if (mantissa != 0U) {
+                // normalize
+                mantissa = (unsigned short int)(mantissa << 1U);
+                while ((mantissa & 0x0400U) == 0U) {
+                    mantissa = (unsigned short int)(mantissa << 1U);
+                    exponent = (unsigned short int)(exponent - 0x0400U);
+                }
+                // discard implicit leading bit
+                mantissa &= 0x03FFU;
+            } else { // Zero
+                exponent = 0U;
+            }
+
+            ur = (sign | exponent) | mantissa;
+        } else {
+            ur = (sign | exponent) | mantissa;
+        }
+    }
+    res.x = ur;
+#endif
+    return res;
+}
+
+/* Converts two packed fp8 values to a pair of raw fp16 bit patterns.
+ * The low byte of x becomes res.x and the high byte becomes res.y.
+ * __NV_E5M2 selects the e5m2 decoding; any other interpretation value is
+ * decoded as e4m3. Device builds with __CUDA_ARCH__ >= 890 use a single
+ * cvt instruction, all other builds convert the two bytes one at a time. */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation) {
+    __half2_raw res;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
+    unsigned int packed_halves;
+    if (fp8_interpretation != __NV_E5M2) {
+        asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(packed_halves) : "h"(x));
+    } else {
+        asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(packed_halves) : "h"(x));
+    }
+    /* copy the 32-bit register image into the two-element result */
+    (void)memcpy(&res, &packed_halves, sizeof(packed_halves));
+#else
+    const __nv_fp8_storage_t low_byte = (__nv_fp8_storage_t)x;
+    const __nv_fp8_storage_t high_byte = (__nv_fp8_storage_t)(x >> 8U);
+    res.x = __nv_cvt_fp8_to_halfraw(low_byte, fp8_interpretation).x;
+    res.y = __nv_cvt_fp8_to_halfraw(high_byte, fp8_interpretation).x;
+#endif
+    return res;
+}
+
+/*
+ * Converts a raw bfloat16 value to an 8-bit e8m0 value (exponent only; the
+ * sign of the input is dropped).
+ *
+ * x        - raw bfloat16 input.
+ * saturate - __NV_SATFINITE turns a non-NaN result of 0xFF into 0xFE;
+ *            __NV_NOSAT leaves the result untouched.
+ * rounding - must be cudaRoundZero or cudaRoundPosInf. This is now asserted,
+ *            matching the other e8m0 converters in this file. Without the
+ *            check an unsupported mode silently produced 0 on the hardware
+ *            path and was treated as round-toward-zero on the software path.
+ *
+ * Returns the e8m0 storage byte.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8_storage_t __nv_cvt_bfloat16raw_to_e8m0(const __nv_bfloat16_raw x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned short ures = 0U;
+    // the instruction converts a pair; the upper input half is left zero and
+    // only the low result byte is returned
+    unsigned in = (unsigned)(x.x);
+    if ((rounding == cudaRoundZero) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rz.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundZero) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rz.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_SATFINITE)) {
+        asm("{cvt.rp.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    } else if ((rounding == cudaRoundPosInf) && (saturate == __NV_NOSAT)) {
+        asm("{cvt.rp.ue8m0x2.bf16x2 %0, %1;}\n"
+                : "=h"(ures)
+                : "r"(in));
+    }
+    return (__nv_fp8_storage_t)ures;
+#else
+    // extract exponent bits, provides non-saturated result in RZ
+    // (the cast to unsigned char also drops the sign bit)
+    __nv_fp8_storage_t res = (unsigned char)(x.x >> 7U);
+
+    if (rounding == cudaRoundPosInf) {
+        // round-up if mantissa non-zero and |x| > 2^-127 and finite
+        if ((x.x & 0x007FU) && ((x.x & 0x7FFFU) > 0x0040U) && ((x.x & 0x7FFFU) < 0x7F80U)) res++;
+    }
+
+    // Handle saturation of non-NaN large inputs to finite
+    if (saturate == __NV_SATFINITE) {
+        // non-NaN, Overflow --> Max
+        if (((x.x & 0x7FFFU) <= 0x7F80U) && (res == 0xFFU))
+        {
+            res--;
+        }
+    }
+    return res;
+#endif
+}
+
+/*
+ * Converts a pair of raw bfloat16 values to two packed e8m0 values.
+ * The result for x.x occupies the low byte, the result for x.y the high byte.
+ *
+ * saturate - __NV_SATFINITE or __NV_NOSAT.
+ * rounding - cudaRoundZero or cudaRoundPosInf (asserted).
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8x2_storage_t __nv_cvt_bfloat162raw_to_e8m0x2(const __nv_bfloat162_raw x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    __nv_fp8x2_storage_t packed_out = 0U;
+    // x.x in the low 16 bits, x.y in the high 16 bits
+    const unsigned packed_in = (unsigned)(x.x) | ((unsigned)(x.y) << (unsigned)16U);
+    if (rounding == cudaRoundZero) {
+        if (saturate == __NV_SATFINITE) {
+            asm("{cvt.rz.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                    : "=h"(packed_out)
+                    : "r"(packed_in));
+        } else if (saturate == __NV_NOSAT) {
+            asm("{cvt.rz.ue8m0x2.bf16x2 %0, %1;}\n"
+                    : "=h"(packed_out)
+                    : "r"(packed_in));
+        }
+    } else if (rounding == cudaRoundPosInf) {
+        if (saturate == __NV_SATFINITE) {
+            asm("{cvt.rp.satfinite.ue8m0x2.bf16x2 %0, %1;}\n"
+                    : "=h"(packed_out)
+                    : "r"(packed_in));
+        } else if (saturate == __NV_NOSAT) {
+            asm("{cvt.rp.ue8m0x2.bf16x2 %0, %1;}\n"
+                    : "=h"(packed_out)
+                    : "r"(packed_in));
+        }
+    }
+    return packed_out;
+#else
+    // software path: convert each element with the scalar routine
+    __nv_bfloat16_raw element;
+
+    element.x = x.y;
+    const __nv_fp8_storage_t high_byte =
+        __nv_cvt_bfloat16raw_to_e8m0(element, saturate, rounding);
+
+    element.x = x.x;
+    const __nv_fp8_storage_t low_byte =
+        __nv_cvt_bfloat16raw_to_e8m0(element, saturate, rounding);
+
+    return (__nv_fp8x2_storage_t)((((__nv_fp8x2_storage_t)high_byte) << 8U) |
+                                  low_byte);
+#endif
+}
+
+/* Returns the object representation of a float as an unsigned int
+ * (byte copy, no numeric conversion). */
+__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int __internal_fp8_float_as_uint(const float f)
+{
+    unsigned int bits;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&bits, &f, sizeof(f));
+#else
+    (void)::std::memcpy(&bits, &f, sizeof(f));
+#endif
+    return bits;
+}
+
+/* Returns the float whose object representation equals the given unsigned
+ * int (byte copy, no numeric conversion). */
+__CUDA_HOSTDEVICE_FP8_DECL__ float __internal_fp8_uint_as_float(const unsigned int u)
+{
+    float value;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&value, &u, sizeof(u));
+#else
+    (void)::std::memcpy(&value, &u, sizeof(u));
+#endif
+    return value;
+}
+
+/* Converts a float to a raw bfloat16 value, rounding toward zero.
+ * Targets providing SM_80 use the cvt.rz instruction; elsewhere the upper
+ * 16 bits of the float encoding are kept, and NaN inputs yield 0x7FFF.
+ * Note: the code inside NV_IF_ELSE_TARGET must not contain top-level commas. */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
+__internal_float_to_bf16raw_rz(const float x) {
+    __nv_bfloat16_raw r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,
+    asm("{  cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(r.x) : "f"(x));
+,
+    const unsigned int bits = __internal_fp8_float_as_uint(x);
+    const unsigned int magnitude = bits & 0x7FFFFFFFU;
+    if (magnitude <= 0x7f800000U)
+    {
+        // finite or infinite: dropping the low half truncates toward zero
+        r.x = (unsigned short int)(bits >> 16U);
+    }
+    else
+    {
+        // NaN
+        r.x = (unsigned short int)0x7FFFU;
+    }
+)
+    return r;
+}
+
+/* Converts a float to a raw bfloat16 value, rounding toward +infinity.
+ * Targets providing SM_90 use the cvt.rp instruction; elsewhere the upper
+ * 16 bits are kept and incremented by one for non-negative finite inputs
+ * whose discarded low half is non-zero. NaN inputs yield 0x7FFF.
+ * Note: the code inside NV_IF_ELSE_TARGET must not contain top-level commas. */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
+__internal_float_to_bf16raw_ru(const float x) {
+    __nv_bfloat16_raw r;
+NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,
+    asm("{  cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(r.x) : "f"(x));
+,
+    const unsigned int bits = __internal_fp8_float_as_uint(x);
+    // default: truncate (covers negative inputs, +inf and exact values)
+    unsigned int upper = bits >> 16U;
+    if ((bits & 0x7FFFFFFFU) > 0x7f800000U)
+    {
+        // NaN
+        upper = 0x7FFFU;
+    }
+    else if ((bits < 0x7f800000U) && ((bits & 0x0000FFFFU) != 0))
+    {
+        // 0 <= x < +inf with discarded bits set: round-up
+        upper = upper + 1U;
+    }
+    r.x = (unsigned short int)upper;
+)
+    return r;
+}
+
+/*
+ * Converts a float to an 8-bit e8m0 value.
+ *
+ * saturate - __NV_SATFINITE or __NV_NOSAT.
+ * rounding - cudaRoundZero or cudaRoundPosInf (asserted).
+ *
+ * The software path narrows the input to bfloat16 with the matching rounding
+ * direction and reuses __nv_cvt_bfloat16raw_to_e8m0. For cudaRoundPosInf the
+ * sign bit is cleared first, so the magnitude is what gets rounded up.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8_storage_t __nv_cvt_float_to_e8m0(const float x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+    __nv_fp8_storage_t res = 0U;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    // the instruction converts a pair; 0.0f fills the other input and only
+    // the low result byte is kept
+    unsigned short pair_out = 0U;
+    if (rounding == cudaRoundZero) {
+        if (saturate == __NV_SATFINITE) {
+            asm("{cvt.rz.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(pair_out)
+                    : "f"(x), "f"(0.0f));
+        } else if (saturate == __NV_NOSAT) {
+            asm("{cvt.rz.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(pair_out)
+                    : "f"(x), "f"(0.0f));
+        }
+    } else if (rounding == cudaRoundPosInf) {
+        if (saturate == __NV_SATFINITE) {
+            asm("{cvt.rp.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(pair_out)
+                    : "f"(x), "f"(0.0f));
+        } else if (saturate == __NV_NOSAT) {
+            asm("{cvt.rp.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(pair_out)
+                    : "f"(x), "f"(0.0f));
+        }
+    }
+    res = (__nv_fp8_storage_t)pair_out;
+#else
+    if (rounding != cudaRoundZero)
+    {   //cudaRoundPosInf
+        // shift the sign bit out and back in as zero to obtain |x|
+        const unsigned int sign_cleared =
+            (__internal_fp8_float_as_uint(x) << (unsigned)1U) >> (unsigned)1U;
+        const float magnitude = __internal_fp8_uint_as_float(sign_cleared);
+        res = __nv_cvt_bfloat16raw_to_e8m0(__internal_float_to_bf16raw_ru(magnitude), saturate, rounding);
+    }
+    else
+    {
+        res = __nv_cvt_bfloat16raw_to_e8m0(__internal_float_to_bf16raw_rz(x), saturate, rounding);
+    }
+#endif
+    return res;
+}
+
+/*
+ * Converts a float2 to two packed e8m0 values.
+ * In the software path the result for x.x occupies the low byte and the
+ * result for x.y the high byte.
+ *
+ * saturate - __NV_SATFINITE or __NV_NOSAT.
+ * rounding - cudaRoundZero or cudaRoundPosInf (asserted).
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8x2_storage_t __nv_cvt_float2_to_e8m0x2(const float2 x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    assert((rounding == cudaRoundZero) || (rounding == cudaRoundPosInf));
+    __nv_fp8x2_storage_t res = 0U;
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    if (rounding == cudaRoundZero) {
+        if (saturate == __NV_SATFINITE) {
+            asm("{cvt.rz.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(res)
+                    : "f"(x.x), "f"(x.y));
+        } else if (saturate == __NV_NOSAT) {
+            asm("{cvt.rz.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(res)
+                    : "f"(x.x), "f"(x.y));
+        }
+    } else if (rounding == cudaRoundPosInf) {
+        if (saturate == __NV_SATFINITE) {
+            asm("{cvt.rp.satfinite.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(res)
+                    : "f"(x.x), "f"(x.y));
+        } else if (saturate == __NV_NOSAT) {
+            asm("{cvt.rp.ue8m0x2.f32 %0, %2, %1;}\n"
+                    : "=h"(res)
+                    : "f"(x.x), "f"(x.y));
+        }
+    }
+#else
+    // software path: convert each component with the scalar routine
+    const __nv_fp8_storage_t high_byte = __nv_cvt_float_to_e8m0(x.y, saturate, rounding);
+    const __nv_fp8_storage_t low_byte = __nv_cvt_float_to_e8m0(x.x, saturate, rounding);
+    res = (__nv_fp8x2_storage_t)((((__nv_fp8x2_storage_t)high_byte) << 8U) | low_byte);
+#endif
+    return res;
+}
+
+/*
+ * Narrows a double to float so that a later rounding step can still tell
+ * that the value was inexact. The double is first converted to float; if
+ * that conversion moved the value away from zero, the float encoding is
+ * stepped back by one. Whenever the conversion was inexact and the float is
+ * not NaN, the lowest bit of the encoding is then set as a sticky marker.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__
+float __internal_double_to_float_with_sticky(double x)
+{
+#if (defined __CUDA_ARCH__)
+    // protect from ftz in device code
+    float f;
+    double d;
+    asm("{  cvt.rn.f32.f64 %0, %1;}\n" : "=f"(f) : "d"(x));
+    asm("{  cvt.f64.f32 %0, %1;}\n" : "=d"(d) : "f"(f));
+#else
+    const float f = (float)x;
+    const double d = (double)f;
+#endif
+    unsigned int bits = __internal_fp8_float_as_uint(f);
+    // evaluated on the unmodified encoding: shifting out the sign leaves
+    // exponent and mantissa, which exceed 0xFF000000 only for NaN
+    const int not_nan = ((bits << (unsigned)1U) <= (unsigned)0xFF000000U) ? 1 : 0;
+    // the round trip grew the magnitude (the two cases are mutually exclusive)
+    const int grew_in_magnitude =
+        (((x > 0.0) && (d > x)) || ((x < 0.0) && (d < x))) ? 1 : 0;
+
+    if (grew_in_magnitude == 1) {
+        bits--;
+    }
+    if ((d != x) && (not_nan == 1)) {
+        bits |= 1U;
+    }
+    return __internal_fp8_uint_as_float(bits);
+}
+
+/* Converts a double to an 8-bit e8m0 value: the input is narrowed to float
+ * with __internal_double_to_float_with_sticky and then converted by
+ * __nv_cvt_float_to_e8m0 with the caller's saturation and rounding modes. */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8_storage_t __nv_cvt_double_to_e8m0(const double x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    const float narrowed = __internal_double_to_float_with_sticky(x);
+    return __nv_cvt_float_to_e8m0(narrowed, saturate, rounding);
+}
+
+/* Converts a double2 to two packed e8m0 values: each component is narrowed
+ * to float with __internal_double_to_float_with_sticky and the resulting
+ * float2 is converted by __nv_cvt_float2_to_e8m0x2. */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_fp8x2_storage_t __nv_cvt_double2_to_e8m0x2(const double2 x,
+    const __nv_saturation_t saturate, const enum cudaRoundMode rounding)
+{
+    float2 narrowed;
+    narrowed.y = __internal_double_to_float_with_sticky(x.y);
+    narrowed.x = __internal_double_to_float_with_sticky(x.x);
+    return __nv_cvt_float2_to_e8m0x2(narrowed, saturate, rounding);
+}
+
+/* Expands an e8m0 byte to a bfloat16 bit pattern.
+ * 0xFF maps to 0x7FFF (canonical quiet NaN), 0 maps to 0x0040 (2^-127);
+ * every other value is shifted left by 7 into the exponent position. */
+__CUDA_HOSTDEVICE_FP8_DECL__
+unsigned short __internal_e8m0_to_bf16(const __nv_fp8_storage_t x)
+{
+    if (x == 0xFFU) {
+        return 0x7FFFU; // NaN --> Canonical QNaN
+    }
+    if (x == 0U) {
+        return 0x0040U; // 2^-127
+    }
+    // shift bias exponent bits into place
+    return (unsigned short)(((unsigned short)x) << 7U);
+}
+
+/* Converts one e8m0 value to a raw bfloat16 value. Supported device targets
+ * use the paired cvt instruction and keep the low 16 bits of its result;
+ * all other builds use __internal_e8m0_to_bf16. */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_bfloat16_raw __nv_cvt_e8m0_to_bf16raw(const __nv_fp8_storage_t x)
+{
+    __nv_bfloat16_raw res;
+
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned short pair_in = (unsigned short)x;
+    unsigned pair_out = 0U;
+    asm("{cvt.rn.bf16x2.ue8m0x2 %0, %1;}\n"
+                : "=r"(pair_out)
+                : "h"(pair_in));
+
+    // only the low element carries the converted input
+    res.x = (unsigned short)pair_out;
+#else
+    res.x = __internal_e8m0_to_bf16(x);
+#endif
+
+    return res;
+}
+
+/* Converts two packed e8m0 values to a pair of raw bfloat16 values.
+ * The low byte of x becomes res.x and the high byte becomes res.y. */
+__CUDA_HOSTDEVICE_FP8_DECL__
+__nv_bfloat162_raw __nv_cvt_e8m0x2_to_bf162raw(const __nv_fp8x2_storage_t x)
+{
+    __nv_bfloat162_raw res;
+
+#if ((defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && \
+     ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
+      (__CUDA_ARCH_HAS_FEATURE__(SM120_ALL))))
+    unsigned short pair_in = (unsigned short)x;
+    unsigned pair_out = 0U;
+    asm("{cvt.rn.bf16x2.ue8m0x2 %0, %1;}\n"
+                : "=r"(pair_out)
+                : "h"(pair_in));
+
+    // split the 32-bit result into its two 16-bit elements
+    res.y = (unsigned short)(pair_out >> (unsigned)16U);
+    res.x = (unsigned short)pair_out;
+#else
+    const __nv_fp8_storage_t low_byte = (__nv_fp8_storage_t)x;
+    const __nv_fp8_storage_t high_byte = (__nv_fp8_storage_t)(x >> (unsigned short)8U);
+
+    res.x = __internal_e8m0_to_bf16(low_byte);
+    res.y = __internal_e8m0_to_bf16(high_byte);
+#endif
+
+    return res;
+}
+
+#endif /* !(defined __DOXYGEN_ONLY__) */
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+ * \brief __nv_fp8_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp8 floating-point numbers of \p e5m2 kind:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Storage variable contains the \p fp8 floating-point data.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
+                                      __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+
+    /* Converts from integral: every constructor below first converts the
+     * integer to float and then reuses the constructor from float, so the
+     * __NV_SATFINITE behavior of that constructor applies to all of them */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned long long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p short \p int data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float data type.
+     * The value is first converted to a raw half and then widened to float.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     * The value is first converted to \p float and then to \p __nv_bfloat16
+     * with \p __float2bfloat16_rz.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p double data type.
+     * The value is first converted to \p float and then widened.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char min_val = 0U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if ((bits & 0x7FU) > 0x7CU) {
+            // NaN: with the sign masked off, anything above 0x7C is a NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        signed char i;
+        const float f = float(*this);
+        const signed char max_val = (signed char)0x7FU;
+        const signed char min_val = (signed char)0x80U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if ((bits & 0x7FU) > 0x7CU) {
+            // NaN: with the sign masked off, anything above 0x7C is a NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<signed char>(f);
+        }
+        return i;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0)
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p short \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        return (__x & 0x7FU) != 0U; // sign bit is masked off so -0 is false
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+ * \brief __nv_fp8x2_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Packed storage for the two \p fp8 floating-point values of the vector.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Narrowing constructors from wider vector types. Members are assigned
+     * in the constructor body; no init-lists are used. */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructs from a \p __half2 value. Out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) {
+        const __half2_raw raw_pair = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp8x2(raw_pair, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructs from a \p __nv_bfloat162 value. Out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(raw_pair, __NV_SATFINITE,
+                                             __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructs from a \p float2 value. Out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) {
+        const __nv_fp8x2_storage_t packed =
+            __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+        __x = packed;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructs from a \p double2 value. Out-of-range inputs follow
+     * \p __NV_SATFINITE behavior.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) {
+        const __nv_fp8x2_storage_t packed =
+            __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+        __x = packed;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts both stored values to a \p __half2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        const __half2_raw widened = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2);
+        return static_cast<__half2>(widened);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Converts both stored values to a \p float2, going through the raw
+     * half2 representation.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        const __half2_raw widened = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2);
+        return __internal_halfraw2_to_float2(widened);
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int
+__internal_pack_u16x2_to_u32(const unsigned short int src_lo,
+                             const unsigned short int src_hi) {
+    unsigned int packed; // host branch layout: src_hi -> bits 31..16, src_lo below
+#if (defined __CUDACC__) && (defined __CUDA_ARCH__)
+    asm("{  mov.b32 %0, {%1,%2};}\n" : "=r"(packed) : "h"(src_lo), "h"(src_hi));
+#else
+    packed = static_cast<unsigned int>(src_lo) |
+             (static_cast<unsigned int>(src_hi) << 16U);
+#endif
+    return packed;
+}
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+ * \brief __nv_fp8x4_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Packed storage for four \p fp8 values: the first converted pair lives in
+     * the low 16 bits and the second pair in the high 16 bits.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Narrowing constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __half2 values, \p flo first and \p fhi
+     * second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp8x2_storage_t lo_pair = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t hi_pair = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __nv_bfloat162 values, \p flo first and
+     * \p fhi second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp8x2_storage_t lo_pair = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t hi_pair = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p float4: (x, y) form the first pair and
+     * (z, w) the second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) {
+        const float2 xy = {f.x, f.y};
+        const float2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t lo_pair =
+            __nv_cvt_float2_to_fp8x2(xy, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t hi_pair =
+            __nv_cvt_float2_to_fp8x2(zw, __NV_SATFINITE, __NV_E5M2);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p double4: (x, y) form the first pair and
+     * (z, w) the second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) {
+        const double2 xy = {f.x, f.y};
+        const double2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t lo_pair =
+            __nv_cvt_double2_to_fp8x2(xy, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t hi_pair =
+            __nv_cvt_double2_to_fp8x2(zw, __NV_SATFINITE, __NV_E5M2);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float4; each pair is widened via half precision.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        const __nv_fp8x2_storage_t lo_bits = static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t hi_bits =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+        const float2 xy = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(lo_bits, __NV_E5M2));
+        const float2 zw = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(hi_bits, __NV_E5M2));
+        float4 unpacked = {xy.x, xy.y, zw.x, zw.y};
+        return unpacked;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+ * \brief __nv_fp8_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storing
+ * \p fp8 floating-point numbers of \p e4m3 kind:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Storage variable holding the raw \p fp8 encoding of the value.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note: constructor init-lists are deliberately avoided because of
+     * special host/device compilation rules; __x is assigned in the body. */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
+                                      __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+
+    /* Converts from integral: every overload goes through the float constructor */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p short \p int data type: converts via
+     * \p float and relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p int data type: converts via \p float and
+     * relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p int data type: converts via
+     * \p float and relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type: converts
+     * via \p float and relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p short \p int data type: converts via \p float and
+     * relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p int data type: converts via \p float and relies on
+     * \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p int data type: converts via \p float and
+     * relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p long \p int data type: converts via
+     * \p float and relies on \p __NV_SATFINITE for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float data type (widens via half precision).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __nv_bfloat16 data type (goes through \p float).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return __float2bfloat16_rz(float(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p double data type (goes through \p float).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char min_val = 0U;
+        const unsigned char bits = (*this).__x;
+        // NaN is tested first, then the value is clamped to [min_val, max_val]
+        if ((bits & 0x7FU) == 0x7FU) {
+            // NaN: every bit below the sign bit is set (0x7F or 0xFF)
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // above the output range: clamp to max_val
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // below the output range: clamp to min_val
+            i = min_val;
+        } else {
+            // in range: plain float-to-integer cast
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
+        unsigned long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) == sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<unsigned long>(__half2ull_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<unsigned long>(__half2uint_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        signed char i;
+        const float f = float(*this);
+        const signed char max_val = (signed char)0x7FU;
+        const signed char min_val = (signed char)0x80U;
+        const unsigned char bits = (*this).__x;
+        // NaN is tested first, then the value is clamped to [min_val, max_val]
+        if ((bits & 0x7FU) == 0x7FU) {
+            // NaN: every bit below the sign bit is set (0x7F or 0xFF)
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // above the output range: clamp to max_val
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // below the output range: clamp to min_val
+            i = min_val;
+        } else {
+            // in range: plain float-to-integer cast
+            i = static_cast<signed char>(f);
+        }
+        return i;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to an implementation-defined \p char data type.
+     *
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
+        char value;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (((char)-1) < (char)0)
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            value = static_cast<char>(static_cast<signed char>(*this));
+        }
+        else
+        {
+            value = static_cast<char>(static_cast<unsigned char>(*this));
+        }
+        return value;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p short \p int data type.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p int data type.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000LL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
+        long retval;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) == sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            retval = static_cast<long>(__half2ll_rz(__half(*this)));
+        }
+        else
+        {
+            retval = static_cast<long>(__half2int_rz(__half(*this)));
+        }
+        return retval;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * All other inputs, including \p NaN, convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        return (__x & 0x7FU) != 0U;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+ * \brief __nv_fp8x2_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of two \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Packed storage for the two \p fp8 values, exactly as returned by the
+     * pair conversion routines used in the constructors.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Narrowing constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __half2 value, converting with the
+     * \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) {
+        const __half2_raw raw_pair = static_cast<__half2_raw>(f);
+        __x = __nv_cvt_halfraw2_to_fp8x2(raw_pair, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p __nv_bfloat162 value, converting with the
+     * \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(raw_pair, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p float2 value, converting with the
+     * \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) {
+        this->__x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the pair from a \p double2 value, converting with the
+     * \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) {
+        this->__x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half2 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        return __half2(__nv_cvt_fp8x2_to_halfraw2(this->__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float2 data type (widens via half precision).
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        const __half2_raw widened = __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3);
+        return __internal_halfraw2_to_float2(widened);
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+ * \brief __nv_fp8x4_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of four \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Packed storage for four \p fp8 values: the first converted pair lives in
+     * the low 16 bits and the second pair in the high 16 bits.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Narrowing constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __half2 values, \p flo first and \p fhi
+     * second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp8x2_storage_t lo_pair = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t hi_pair = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from two \p __nv_bfloat162 values, \p flo first and
+     * \p fhi second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp8x2_storage_t lo_pair = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t hi_pair = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p float4: (x, y) form the first pair and
+     * (z, w) the second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) {
+        const float2 xy = {f.x, f.y};
+        const float2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t lo_pair =
+            __nv_cvt_float2_to_fp8x2(xy, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t hi_pair =
+            __nv_cvt_float2_to_fp8x2(zw, __NV_SATFINITE, __NV_E4M3);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Builds the vector from a \p double4: (x, y) form the first pair and
+     * (z, w) the second, converting with the \p __NV_SATFINITE saturation mode.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) {
+        const double2 xy = {f.x, f.y};
+        const double2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t lo_pair =
+            __nv_cvt_double2_to_fp8x2(xy, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t hi_pair =
+            __nv_cvt_double2_to_fp8x2(zw, __NV_SATFINITE, __NV_E4M3);
+        this->__x = __internal_pack_u16x2_to_u32(lo_pair, hi_pair);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening conversion operators */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float4; each pair is widened via half precision.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        const __nv_fp8x2_storage_t lo_bits = static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t hi_bits =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+        const float2 xy = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(lo_bits, __NV_E4M3));
+        const float2 zw = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(hi_bits, __NV_E4M3));
+        float4 unpacked = {xy.x, xy.y, zw.x, zw.y};
+        return unpacked;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \ingroup CUDA_MATH_FP8_E8M0_STRUCT
+ * \brief __nv_fp8_e8m0 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * 8-bit scale factors of \p e8m0 kind: each value is interpreted as a
+ * power of two with a biased exponent. The bias is 127, so encodings 0
+ * through 254 represent 2^-127 through 2^127. Encoding \p 0xFF = 255 is
+ * reserved for NaN.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e8m0 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E8M0_STRUCT
+     * Storage variable contains the 8-bit scale data.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e8m0() = default;
+#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP8): empty user-provided ctor */
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note we do avoid constructor init-list because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding.
+     * \see __nv_cvt_float_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const __half f) {
+        const float widened = __internal_halfraw_to_float(static_cast<__half_raw>(f));
+        __x = __nv_cvt_float_to_e8m0(widened, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding.
+     * \see __nv_cvt_bfloat16raw_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const __nv_bfloat16 f) {
+        const __nv_bfloat16_raw raw = static_cast<__nv_bfloat16_raw>(f);
+        __x = __nv_cvt_bfloat16raw_to_e8m0(raw, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding.
+     * \see __nv_cvt_float_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const float f) {
+        this->__x = __nv_cvt_float_to_e8m0(f, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for large input values and \p cudaRoundZero for
+     * rounding.
+     * \see __nv_cvt_double_to_e8m0 for further details
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const double f) {
+        this->__x = __nv_cvt_double_to_e8m0(f, __NV_SATFINITE, cudaRoundZero);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p short \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e8m0(const unsigned short int val) {
+        this->__x = __nv_fp8_e8m0(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const unsigned int val) {
+        this->__x = __nv_fp8_e8m0(static_cast<double>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e8m0(const unsigned long long int val) {
+        __nv_bfloat16 rn = __ull2bfloat16_rn(val);
+        __nv_bfloat16_raw rn_raw = static_cast<__nv_bfloat16_raw>(rn);
+        // Undo a round-up: raw 0x5F80 is bf16 2^64, above every 64-bit val
+        // (the round-trip compare alone cannot fire for val == ULLONG_MAX).
+        if ((__bfloat162ull_rz(rn) > val) || (rn_raw.x == 0x5F80U)) {
+            rn_raw.x--;
+        }
+        __x = __nv_cvt_bfloat16raw_to_e8m0(rn_raw, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p unsigned \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const unsigned long int val) {
+        this->__x = __nv_fp8_e8m0(static_cast<unsigned long long int>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p short \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const short int val)
+        // Widen to float and take the storage byte of the resulting conversion.
+        : __x(__nv_fp8_e8m0(static_cast<float>(val)).__x) {}
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const int val)
+        // Widen to double and take the storage byte of the resulting conversion.
+        : __x(__nv_fp8_e8m0(static_cast<double>(val)).__x) {}
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const long long int val) {
+        // Emulate round-toward-zero: round to nearest, then undo a step away from zero.
+        __nv_bfloat16 rn = __ll2bfloat16_rn(val);
+        __nv_bfloat16_raw rn_raw = static_cast<__nv_bfloat16_raw>(rn);
+        const long long int back_int = __bfloat162ll_rz(rn);
+        // The largest positive input always rounds up, but a clamped read-back cannot exceed it.
+        const bool grew_pos = (val > 0) && ((back_int > val) || (val == 0x7FFFFFFFFFFFFFFFLL));
+        if (grew_pos || ((val < 0) && (back_int < val))) { rn_raw.x--; }
+        __x = __nv_cvt_bfloat16raw_to_e8m0(rn_raw, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p long \p int data type, relies on
+     * \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e8m0(const long int val)
+        // Forward through the long long conversion and keep its storage byte.
+        : __x(__nv_fp8_e8m0(static_cast<long long int>(val)).__x) {}
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_bf16raw_to_float(__nv_cvt_e8m0_to_bf16raw(this->__x)); // e8m0 -> raw bf16 -> float
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __nv_bfloat16 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        // Convert the stored byte to raw bf16 bits and wrap them in the class type.
+        return __nv_bfloat16(__nv_cvt_e8m0_to_bf16raw(this->__x));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p double data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        const float narrow = static_cast<float>(*this);
+        double widened;
+#if !(defined __CUDA_ARCH__)
+        widened = static_cast<double>(narrow);
+#else
+        // protect from ftz in device code
+        asm("{  cvt.f64.f32 %0, %1;}\n" : "=d"(widened) : "f"(narrow));
+#endif
+        return widened;
+    }
+
+    /* rounding conversion to half */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return __float2half_rn(static_cast<float>(*this)); // goes through the float conversion operator
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        const unsigned char upper_limit = 0xFFU;
+
+        // The all-ones bit pattern encodes NaN, which converts to zero.
+        if (this->__x == 0xFFU) {
+            return static_cast<unsigned char>(0);
+        }
+
+        const float as_float = static_cast<float>(*this);
+
+        // Anything above the destination maximum is clamped to it.
+        if (as_float > static_cast<float>(upper_limit)) {
+            return upper_limit;
+        }
+
+        // Remaining values are converted with a plain cast.
+        return static_cast<unsigned char>(as_float);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __bfloat162ushort_rz(static_cast<__nv_bfloat16>(*this)); // via the bf16 conversion operator
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __bfloat162uint_rz(static_cast<__nv_bfloat16>(*this)); // via the bf16 conversion operator
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long int() const {
+        const __nv_bfloat16 as_bf16 = static_cast<__nv_bfloat16>(*this);
+        unsigned long result;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(unsigned long) != sizeof(unsigned long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // unsigned long differs in size from unsigned long long: use the unsigned int path
+            result = static_cast<unsigned long>(__bfloat162uint_rz(as_bf16));
+        } else {
+            result = static_cast<unsigned long>(__bfloat162ull_rz(as_bf16));
+        }
+        return result;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __bfloat162ull_rz(static_cast<__nv_bfloat16>(*this)); // via the bf16 conversion operator
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        const signed char upper_limit = (signed char)0x7FU;
+
+        // The all-ones bit pattern encodes NaN, which converts to zero.
+        if (this->__x == 0xFFU) {
+            return static_cast<signed char>(0);
+        }
+
+        const float as_float = static_cast<float>(*this);
+
+        // Anything above the destination maximum is clamped to it.
+        if (as_float > static_cast<float>(upper_limit)) {
+            return upper_limit;
+        }
+
+        // Remaining values are converted with a plain cast.
+        return static_cast<signed char>(as_float);
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to an implementation defined \p char data type.
+     * 
+     * Detects signedness of the \p char type and proceeds accordingly, see
+     * further details in signed and unsigned char operators.
+     *
+     * Clamps inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator char() const {
+        char result;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (static_cast<char>(0) <= static_cast<char>(-1))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // plain char is unsigned on this target
+            result = static_cast<char>(static_cast<unsigned char>(*this));
+        } else {
+            // plain char is signed on this target
+            result = static_cast<char>(static_cast<signed char>(*this));
+        }
+        return result;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p short \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __bfloat162short_rz(static_cast<__nv_bfloat16>(*this)); // via the bf16 conversion operator
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __bfloat162int_rz(static_cast<__nv_bfloat16>(*this)); // via the bf16 conversion operator
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero if output type is 32-bit.
+     * \p NaN inputs convert to \p 0x8000000000000000LL if output type is 64-bit.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long int() const {
+        const __nv_bfloat16 as_bf16 = static_cast<__nv_bfloat16>(*this);
+        long result;
+        /* Suppress VS warning: warning C4127: conditional expression is constant */
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (push)
+#pragma warning (disable: 4127)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        if (sizeof(long) != sizeof(long long))
+#if defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#pragma warning (pop)
+#endif /* _MSC_VER && !defined(__CUDA_ARCH__) */
+        {
+            // long differs in size from long long: use the int conversion path
+            result = static_cast<long>(__bfloat162int_rz(as_bf16));
+        } else {
+            result = static_cast<long>(__bfloat162ll_rz(as_bf16));
+        }
+        return result;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p long \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __bfloat162ll_rz(static_cast<__nv_bfloat16>(*this)); // via the bf16 conversion operator
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p bool data type.
+     * All values in input range are non-zero, so result is always \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        return true; // unconditional: the stored bits are not inspected
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E8M0_STRUCT C++ struct for handling vector type of two scale factors of e8m0 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E8M0_STRUCT
+ * \brief __nv_fp8x2_e8m0 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of two scale factors of \p e8m0 kind each.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e8m0 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E8M0_STRUCT
+     * Raw storage holding both \p e8m0 scale factors of the
+     * vector.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e8m0() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __half2: the pair is widened to \p float2, then
+     * converted with \p __NV_SATFINITE saturation and \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const __half2 f) {
+        const float2 widened = __half22float2(f);
+        __x = __nv_cvt_float2_to_e8m0x2(widened, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p __nv_bfloat162: the raw pair is converted with
+     * \p __NV_SATFINITE saturation and \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const __nv_bfloat162 f) {
+        const __nv_bfloat162_raw raw_pair = static_cast<__nv_bfloat162_raw>(f);
+        __x = __nv_cvt_bfloat162raw_to_e8m0x2(raw_pair, __NV_SATFINITE, cudaRoundZero);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float2, converted with \p __NV_SATFINITE
+     * saturation and \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const float2 f)
+        : __x(__nv_cvt_float2_to_e8m0x2(f, __NV_SATFINITE, cudaRoundZero))
+    {
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double2, converted with \p __NV_SATFINITE
+     * saturation and \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e8m0(const double2 f)
+        : __x(__nv_cvt_double2_to_e8m0x2(f, __NV_SATFINITE, cudaRoundZero)) {
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Conversions back to wider vector types */
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __nv_bfloat162, built from the raw bf16 pair.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat162() const {
+        return __nv_bfloat162(__nv_cvt_e8m0x2_to_bf162raw(this->__x));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float2, obtained through \p __nv_bfloat162.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        return __bfloat1622float2(__nv_bfloat162(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p __half2, obtained through \p float2.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        return __float22half2_rn(float2(*this));
+    }
+
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E8M0_STRUCT C++ struct for handling vector type of four scale factors of e8m0 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E8M0_STRUCT
+ * \brief __nv_fp8x4_e8m0 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of four scale factors of \p e8m0 kind each.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e8m0 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E8M0_STRUCT
+     * Raw storage holding all four \p e8m0 scale factors of the
+     * vector.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Default constructor; performs no conversion.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e8m0() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Constructors from wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from two \p __half2 values: each is converted as a
+     * \p __nv_fp8x2_e8m0 and the two results are packed together.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp8x2_e8m0 lower(flo);
+        const __nv_fp8x2_e8m0 upper(fhi);
+        __x = __internal_pack_u16x2_to_u32(lower.__x, upper.__x);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from two \p __nv_bfloat162 values: each is converted as a
+     * \p __nv_fp8x2_e8m0 and the two results are packed together.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp8x2_e8m0 lower(flo);
+        const __nv_fp8x2_e8m0 upper(fhi);
+        __x = __internal_pack_u16x2_to_u32(lower.__x, upper.__x);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p float4, converted pairwise with \p __NV_SATFINITE
+     * saturation and \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const float4 f) {
+        // x,y feed the first packed argument; z,w feed the second.
+        const float2 xy = {f.x, f.y};
+        const __nv_fp8x2_storage_t lo_bits = __nv_cvt_float2_to_e8m0x2(xy, __NV_SATFINITE, cudaRoundZero);
+        const float2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t hi_bits = __nv_cvt_float2_to_e8m0x2(zw, __NV_SATFINITE, cudaRoundZero);
+
+        __x = __internal_pack_u16x2_to_u32(lo_bits, hi_bits);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Constructor from \p double4, converted pairwise with \p __NV_SATFINITE
+     * saturation and \p cudaRoundZero rounding.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e8m0(const double4 f) {
+        // x,y feed the first packed argument; z,w feed the second.
+        const double2 xy = {f.x, f.y};
+        const __nv_fp8x2_storage_t lo_bits = __nv_cvt_double2_to_e8m0x2(xy, __NV_SATFINITE, cudaRoundZero);
+        const double2 zw = {f.z, f.w};
+        const __nv_fp8x2_storage_t hi_bits = __nv_cvt_double2_to_e8m0x2(zw, __NV_SATFINITE, cudaRoundZero);
+
+        __x = __internal_pack_u16x2_to_u32(lo_bits, hi_bits);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Conversions back to wider vector types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_MISC
+     * Conversion operator to \p float4, widening one two-element half at a time.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        // Low pair: storage truncated to the x2 type; high pair: storage shifted down by 16.
+        __nv_fp8x2_e8m0 lower_pair;
+        __nv_fp8x2_e8m0 upper_pair;
+        lower_pair.__x = static_cast<__nv_fp8x2_storage_t>(this->__x);
+        upper_pair.__x = static_cast<__nv_fp8x2_storage_t>(this->__x >> 16U);
+        const float2 first = static_cast<float2>(lower_pair);
+        const float2 second = static_cast<float2>(upper_pair);
+        const float4 widened = {first.x, first.y, second.x, second.y};
+        return widened;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+#endif /* defined(__cplusplus) */
+
+#endif /* end of include guard: __CUDA_FP8_HPP__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..df64a8afa14f695bb05810266ac40b227c078cc5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_gl_interop.h
@@ -0,0 +1,514 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_GL_INTEROP_H__)
+#define __CUDA_GL_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+#if defined(__APPLE__)
+
+#include <OpenGL/gl.h>
+
+#else /* __APPLE__ */
+
+#if defined(__arm__) || defined(__aarch64__)
+#ifndef GL_VERSION
+#error Please include the appropriate gl headers before including cuda_gl_interop.h
+#endif
+#else
+#include <GL/gl.h>
+#endif
+
+#endif /* __APPLE__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED /* expands to nothing: no deprecation attribute is applied */
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED /* expands to nothing: neither _MSC_VER nor __GNUC__ is defined */
+#endif /* __CUDA_DEPRECATED selection */
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_OPENGL OpenGL Interoperability
+ * This section describes the OpenGL interoperability functions of the CUDA
+ * runtime application programming interface. Note that mapping of OpenGL
+ * resources is performed with the graphics API agnostic, resource mapping 
+ * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * Selects which CUDA devices of the current OpenGL context ::cudaGLGetDevices returns
+ */
+enum cudaGLDeviceList
+{
+  cudaGLDeviceListAll           = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+  cudaGLDeviceListCurrentFrame  = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+  cudaGLDeviceListNextFrame     = 3  /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
+};
+
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices 
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices 
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to 
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return ::cudaErrorNoDevice.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the 
+ *                           current OpenGL context
+ * \param pCudaDevices     - Returned CUDA devices corresponding to the current 
+ *                           OpenGL context
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::cudaGLDeviceListAll for all devices, 
+ *                           ::cudaGLDeviceListCurrentFrame for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::cudaGLDeviceListNextFrame for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorInvalidGraphicsContext,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ *
+ * \note This function is not supported on Mac OS X.
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGLGetDevices 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
+
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, 
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, 
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p flags specify the intended usage, as follows: 
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats 
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param image    - name of texture or renderbuffer object to be registered
+ * \param target   - Identifies the type of object specified by \p image 
+ * \param flags    - Register flags
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * resource.  The register flags \p flags specify the intended usage,
+ * as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param buffer   - name of buffer object to be registered
+ * \param flags    - Register flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorOperatingSystem,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsGLRegisterBuffer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
+
+#ifdef _WIN32
+#ifndef WGL_NV_gpu_affinity
+typedef void* HGPUNV; /* fallback handle type used when WGL_NV_gpu_affinity is not defined */
+#endif /* WGL_NV_gpu_affinity */
+
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns the CUDA device associated with a hGpu, if applicable.
+ *
+ * \param device - Returns the device associated with hGpu, or -1 if hGpu is
+ * not a compute device.
+ * \param hGpu   - Handle to a GPU, as queried via WGL_NV_gpu_affinity
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::WGL_NV_gpu_affinity,
+ * ::cuWGLGetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
+#endif
+
+/** @} */ /* END CUDART_OPENGL */
+
+/**
+ * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+
+/**
+ * CUDA GL Map Flags (part of the deprecated OpenGL interoperability group)
+ */
+enum cudaGLMapFlags
+{
+  cudaGLMapFlagsNone         = 0,  /**< Default; assume the resource can be both read and written */
+  cudaGLMapFlagsReadOnly     = 1,  /**< CUDA kernels will not write to this resource */
+  cudaGLMapFlagsWriteDiscard = 2   /**< CUDA kernels will only write to and will not read from this resource */
+};
+
+/**
+ * \brief Sets a CUDA device to use OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * This function will immediately initialize the primary context on 
+ * \p device if needed.
+ *
+ * \param device - Device to use for OpenGL interoperability
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
+
+/**
+ * \brief Registers a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Registers the buffer object of ID \p bufObj for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  The OpenGL context used to create the buffer, or another
+ * context from the same share group, must be bound to the current
+ * thread when this is called.
+ *
+ * \param bufObj - Buffer object ID to register
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
+
+/**
+ * \brief Unregisters a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unregisters the buffer object of ID \p bufObj for access by CUDA
+ * and releases any CUDA resources associated with the buffer.  Once a
+ * buffer is unregistered, it may no longer be mapped by CUDA.  The GL
+ * context used to create the buffer, or another context from the
+ * same share group, must be bound to the current thread when this is
+ * called.
+ *
+ * \param bufObj - Buffer object to unregister
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnregisterResource
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Set usage flags for mapping an OpenGL buffer
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Sets flags for mapping the OpenGL buffer \p bufObj.
+ *
+ * Changes to flags will take effect the next time \p bufObj is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
+ * be used. It is therefore assumed that this buffer will be read from and
+ * written to by CUDA kernels. This is the default value.
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
+ * buffer will not write to the buffer.
+ * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
+ * this buffer will not read from the buffer and will write over the
+ * entire contents of the buffer, so none of the data previously stored in
+ * the buffer will be preserved.
+ *
+ * If \p bufObj has not been registered for use with CUDA, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
+ *
+ * \param bufObj    - Registered buffer object to set flags for
+ * \param flags     - Parameters for buffer mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsResourceSetMapFlags
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags); 
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
+
+/** @} */ /* END CUDART_OPENGL_DEPRECATED */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#undef __CUDA_DEPRECATED
+
+#endif /* __CUDA_GL_INTEROP_H__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_stdint.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_stdint.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a9814410e4b6fb4f07ad9edc8394e956b77dbcd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_stdint.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2009-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __cuda_stdint_h__
+#define __cuda_stdint_h__
+
+// Compiler-specific treatment for C99's stdint.h
+//
+// By default, this header will use the standard headers (so it
+// is your responsibility to make sure they are available), except
+// on MSVC before Visual Studio 2010, when they were not provided.
+// To support old MSVC, a few of the commonly-used definitions are
+// provided here.  If more definitions are needed, add them here,
+// or replace these definitions with a complete implementation,
+// such as the ones available from Google, Boost, or MSVC10.  You
+// can prevent the definition of any of these types (in order to
+// use your own) by #defining CU_STDINT_TYPES_ALREADY_DEFINED.
+
+#if !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+// In VS including stdint.h forces the C++ runtime dep - provide an opt-out
+// (CU_STDINT_VS_FORCE_NO_STDINT_H) for users that care (notably static
+// cudart).
+#if defined(_MSC_VER) && ((_MSC_VER < 1600) || defined(CU_STDINT_VS_FORCE_NO_STDINT_H))
+
+// These definitions can be used with MSVC 8 and 9,
+// which don't ship with stdint.h:
+
+typedef unsigned   char   uint8_t;
+
+typedef            short  int16_t;
+typedef unsigned   short uint16_t;
+
+// To keep it consistent across all MSVC builds, define those types
+// in the exact same way they are defined in the MSVC headers.
+#if defined(_MSC_VER)
+typedef signed     char    int8_t;
+
+typedef            int     int32_t;
+typedef unsigned   int     uint32_t;
+
+typedef long long          int64_t;
+typedef unsigned long long uint64_t;
+#else
+typedef            char    int8_t;
+
+typedef            long   int32_t;
+typedef unsigned   long  uint32_t;
+
+typedef          __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+#endif
+
+#elif defined(__DJGPP__)
+
+// These definitions can be used when compiling
+// C code with DJGPP, which only provides stdint.h
+// when compiling C++ code with TR1 enabled.
+
+typedef               char    int8_t;
+typedef unsigned      char   uint8_t;
+
+typedef               short  int16_t;
+typedef unsigned      short uint16_t;
+
+typedef               long   int32_t;
+typedef unsigned      long  uint32_t;
+
+typedef          long long   int64_t;
+typedef unsigned long long  uint64_t;
+
+#else
+
+// Use standard headers, as specified by C99 and C++ TR1.
+// Known to be provided by:
+// - gcc/glibc, supported by all versions of glibc
+// - djgpp, supported since 2001
+// - MSVC, supported by Visual Studio 2010 and later
+
+#include <stdint.h>
+
+#endif
+
+#endif // !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+
+#endif // file guard
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_surface_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a35c215668e98006c3eaa286deb70461eb1fa62
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_surface_types.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_SURFACE_TYPES_H__)
+#define __CUDA_SURFACE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_SURFACE_TYPES_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_texture_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f723db5c682a7b4b05491219df8993f0f6ebd59
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_texture_types.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_TEXTURE_TYPES_H__)
+#define __CUDA_TEXTURE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_TEXTURE_TYPES_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..2cf1ba357eb02ed82afc2f1812627a8a2d88c6f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_vdpau_interop.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_VDPAU_INTEROP_H__)
+#define __CUDA_VDPAU_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+#include <vdpau/vdpau.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_VDPAU VDPAU Interoperability
+ * This section describes the VDPAU interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the CUDA device associated with a VdpDevice.
+ *
+ * Returns the CUDA device associated with a VdpDevice, if applicable.
+ *
+ * \param device - Returns the device associated with vdpDevice, or -1 if
+ * the device associated with vdpDevice is not a compute device.
+ * \param vdpDevice - A VdpDevice handle
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cuVDPAUGetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaVDPAUGetDevice(int *device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Sets a CUDA device to use VDPAU interoperability
+ *
+ * Records \p vdpDevice as the VdpDevice for VDPAU interoperability 
+ * with the CUDA device \p device and sets \p device as the current 
+ * device for the calling host thread.
+ *
+ * This function will immediately initialize the primary context on 
+ * \p device if needed.
+ *
+ * If \p device has already been initialized then this call will fail 
+ * with the error ::cudaErrorSetOnActiveProcess.  In this case it is 
+ * necessary to reset \p device using ::cudaDeviceReset() before 
+ * VDPAU interoperability on \p device may be enabled.
+ *
+ * \param device - Device to use for VDPAU interoperability
+ * \param vdpDevice - The VdpDevice to interoperate with
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsVDPAURegisterVideoSurface,
+ * ::cudaGraphicsVDPAURegisterOutputSurface,
+ * ::cudaDeviceReset
+ */
+extern __host__ cudaError_t CUDARTAPI cudaVDPAUSetVDPAUDevice(int device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
+
+/**
+ * \brief Register a VdpVideoSurface object
+ *
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param vdpSurface - VDPAU object to be registered
+ * \param flags - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsVDPAURegisterVideoSurface
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterVideoSurface(struct cudaGraphicsResource **resource, VdpVideoSurface vdpSurface, unsigned int flags);
+
+/**
+ * \brief Register a VdpOutputSurface object
+ *
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param vdpSurface - VDPAU object to be registered
+ * \param flags - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaVDPAUSetVDPAUDevice,
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsVDPAURegisterOutputSurface
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsVDPAURegisterOutputSurface(struct cudaGraphicsResource **resource, VdpOutputSurface vdpSurface, unsigned int flags);
+
+/** @} */ /* END CUDART_VDPAU */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* __CUDA_VDPAU_INTEROP_H__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudart_platform.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudart_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f022bbe349eba2219a6b74f1ea315c1ce8551b7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudart_platform.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDART_PLATFORM_H__
+#define __CUDART_PLATFORM_H__
+
+#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)))
+#define isEglSupported 1
+#endif
+
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity.h
new file mode 100644
index 0000000000000000000000000000000000000000..cdb6b76f8d66e986b20bd481fbeb0a12a791e5a5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity.h
@@ -0,0 +1,8065 @@
+/*
+ * Copyright 2011-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_H_)
+#define _CUPTI_ACTIVITY_H_
+
+/**
+ * Deprecated APIs and structures have been moved to the
+ * header :doc: `cupti_activity_deprecated.h`, which is included at
+ * the bottom of this file. Header cupti_activity.h contains
+ * only the latest version of APIs and structures.
+ */
+
+#include <cuda.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+#include <cupti_result.h>
+
+#if defined(CUPTI_DIRECTIVE_SUPPORT)
+#include <Openacc/cupti_openacc.h>
+#include <Openmp/cupti_openmp.h>
+#endif
+
+#include <cupti_common.h>
+/* Special ID values: all four expand to the same all-ones uint32_t (0xFFFFFFFF), so they cannot be told apart by value alone. */
+#define CUPTI_UNIFIED_MEMORY_CPU_DEVICE_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_CONTEXT_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_STREAM_ID ((uint32_t) 0xFFFFFFFFU)
+#define CUPTI_INVALID_CHANNEL_ID ((uint32_t) 0xFFFFFFFFU)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+/* All-ones uint32_t; going by its name, presumably the "no valid NUMA node" ID -- confirm with its users. */
+#define invalidNumaId ((uint32_t) 0xFFFFFFFF)
+
+/**
+ * \defgroup CUPTI_ACTIVITY_API CUPTI Activity API
+ * Functions, types, and enums that implement the CUPTI Activity API.
+ * @{
+ */
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_Activity
+ * \see CUpti_ActivityAPI
+ * \see CUpti_ActivityContext
+ * \see CUpti_ActivityContext2
+ * \see CUpti_ActivityContext3
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityDeviceAttribute
+ * \see CUpti_ActivityEvent
+ * \see CUpti_ActivityEventInstance
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityKernel9
+ * \see CUpti_ActivityCdpKernel
+ * \see CUpti_ActivityPreemption
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpy5
+ * \see CUpti_ActivityMemcpy6
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemcpyPtoP4
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemset4
+ * \see CUpti_ActivityMemory
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemory3
+ * \see CUpti_ActivityMemory4
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMemoryPool2
+ * \see CUpti_ActivityMetric
+ * \see CUpti_ActivityMetricInstance
+ * \see CUpti_ActivityName
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityMarker2
+ * \see CUpti_ActivityMarkerData
+ * \see CUpti_ActivitySourceLocator
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityGlobalAccess3
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityBranch2
+ * \see CUpti_ActivityOverhead3
+ * \see CUpti_ActivityEnvironment
+ * \see CUpti_ActivityInstructionExecution
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityFunction
+ * \see CUpti_ActivityModule
+ * \see CUpti_ActivitySharedAccess
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityPCSampling3
+ * \see CUpti_ActivityPCSamplingRecordInfo
+ * \see CUpti_ActivityCudaEvent2
+ * \see CUpti_ActivityStream
+ * \see CUpti_ActivitySynchronization2
+ * \see CUpti_ActivityInstructionCorrelation
+ * \see CUpti_ActivityExternalCorrelation
+ * \see CUpti_ActivityUnifiedMemoryCounter3
+ * \see CUpti_ActivityOpenAccData
+ * \see CUpti_ActivityOpenAccLaunch
+ * \see CUpti_ActivityOpenAccOther
+ * \see CUpti_ActivityOpenMp
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ * \see CUpti_ActivityNvLink4
+ * \see CUpti_ActivityPcie
+ * \see CUpti_ActivityConfidentialComputeRotation
+ */
+
+typedef enum {
+  /**
+   * The activity record is invalid.
+   */
+  CUPTI_ACTIVITY_KIND_INVALID  = 0,
+
+  /**
+   * A host<->host, host<->device, or device<->device memory copy.
+   * For peer to peer memory copy, use the kind CUPTI_ACTIVITY_KIND_MEMCPY2.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemcpy6.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY   = 1,
+
+  /**
+   * A memory set executing on the GPU. The corresponding activity
+   * record structure is \ref CUpti_ActivityMemset4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMSET   = 2,
+
+  /**
+   * A kernel executing on the GPU. This activity kind may significantly change
+   * the overall performance characteristics of the application because all
+   * kernel executions are serialized on the GPU. The alternative kernel kind,
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, doesn't break kernel concurrency.
+   * The corresponding activity record structure is \ref CUpti_ActivityKernel9.
+   */
+  CUPTI_ACTIVITY_KIND_KERNEL   = 3,
+
+  /**
+   * A CUDA driver API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_DRIVER   = 4,
+
+  /**
+   * A CUDA runtime API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_RUNTIME  = 5,
+
+  /**
+   * A performance counter (aka event) value. The corresponding activity record
+   * structure is \ref CUpti_ActivityEvent. This activity cannot be directly
+   * enabled or disabled. Information collected using the Event API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT    = 6,
+
+  /**
+   * A performance metric value. The corresponding activity record structure is
+   * \ref CUpti_ActivityMetric. This activity cannot be directly
+   * enabled or disabled. Information collected using the Metric API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC   = 7,
+
+  /**
+   * Information about a CUDA device. The corresponding activity record
+   * structure is \ref CUpti_ActivityDevice5.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE   = 8,
+
+  /**
+   * Information about a CUDA context. The corresponding activity record
+   * structure is \ref CUpti_ActivityContext3.
+   */
+  CUPTI_ACTIVITY_KIND_CONTEXT  = 9,
+
+  /**
+   * A kernel executing on the GPU. This activity kind doesn't break
+   * kernel concurrency. The corresponding activity record structure
+   * is \ref CUpti_ActivityKernel9.
+   */
+  CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL = 10,
+
+  /**
+   * Resource naming done via NVTX APIs for thread, device, context, etc.
+   * The corresponding activity record structure is \ref CUpti_ActivityName.
+   */
+  CUPTI_ACTIVITY_KIND_NAME     = 11,
+
+  /**
+   * Instantaneous, start, or end NVTX marker. The corresponding activity
+   * record structure is \ref CUpti_ActivityMarker2.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER = 12,
+
+  /**
+   * Extended, optional data about an NVTX marker. The user must also enable
+   * CUPTI_ACTIVITY_KIND_MARKER to get records for marker data.
+   * The corresponding activity record structure is \ref CUpti_ActivityMarkerData.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER_DATA = 13,
+
+  /**
+   * Source information about source level result. The corresponding
+   * activity record structure is \ref CUpti_ActivitySourceLocator.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR = 14,
+
+  /**
+   * Results for source-level global access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityGlobalAccess3.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS = 15,
+
+  /**
+   * Results for source-level branch. The corresponding
+   * activity record structure is \ref CUpti_ActivityBranch2.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_BRANCH = 16,
+
+  /**
+   * Overhead added by CUPTI, Compiler, CUDA driver etc. The
+   * corresponding activity record structure is
+   * \ref CUpti_ActivityOverhead3.
+   */
+  CUPTI_ACTIVITY_KIND_OVERHEAD = 17,
+
+  /**
+   * A CDP (CUDA Dynamic Parallel) kernel executing on the GPU. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityCdpKernel. This activity cannot be directly
+   * enabled or disabled. It is enabled and disabled through the concurrent
+   * kernel activity, i.e. CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUPTI_ACTIVITY_KIND_CDP_KERNEL = 18,
+  /**
+   * Preemption activity record indicating a preemption of a CDP (CUDA
+   * Dynamic Parallel) kernel executing on the GPU. The corresponding
+   * activity record structure is \ref CUpti_ActivityPreemption.
+   */
+  CUPTI_ACTIVITY_KIND_PREEMPTION = 19,
+
+  /**
+   * Environment activity records indicating power, clock, thermal,
+   * etc. levels of the GPU. The corresponding activity record
+   * structure is \ref CUpti_ActivityEnvironment.
+   */
+  CUPTI_ACTIVITY_KIND_ENVIRONMENT = 20,
+
+  /**
+   * A performance counter value associated with a specific event domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityEventInstance. This activity cannot be directly
+   * enabled or disabled. Information collected using the Event API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT_INSTANCE = 21,
+
+  /**
+   * A peer to peer memory copy. The corresponding activity record
+   * structure is \ref CUpti_ActivityMemcpyPtoP4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY2 = 22,
+
+  /**
+   * A performance metric value associated with a specific metric domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityMetricInstance. This activity cannot be directly
+   * enabled or disabled. Information collected using the Metric API
+   * can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC_INSTANCE = 23,
+
+  /**
+   * Results for source-level instruction execution.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionExecution.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION = 24,
+
+  /**
+   * Unified Memory counter record. The corresponding activity
+   * record structure is \ref CUpti_ActivityUnifiedMemoryCounter3.
+   */
+  CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER = 25,
+
+  /**
+   * Device global/function record. The corresponding activity
+   * record structure is \ref CUpti_ActivityFunction.
+   */
+  CUPTI_ACTIVITY_KIND_FUNCTION = 26,
+
+  /**
+   * CUDA Module record. The corresponding activity
+   * record structure is \ref CUpti_ActivityModule.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the module callback can
+   * be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_MODULE = 27,
+
+  /**
+   * A device attribute value. The corresponding activity record
+   * structure is \ref CUpti_ActivityDeviceAttribute.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using attributes CUpti_DeviceAttribute
+   * or CUdevice_attribute can be stored in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE   = 28,
+
+  /**
+   * Results for source-level shared access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySharedAccess.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_SHARED_ACCESS = 29,
+
+  /**
+   * PC sampling information for kernels. This will serialize
+   * kernels. The corresponding activity record structure
+   * is \ref CUpti_ActivityPCSampling3. In CUDA 12.5, this kind
+   * is deprecated for Volta and later GPU architectures in favor
+   * of PC Sampling APIs from the header cupti_pcsampling.h which
+   * allows concurrent kernel execution.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING = 30,
+
+  /**
+   * Summary information about PC sampling records. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityPCSamplingRecordInfo. In CUDA 12.5, this kind
+   * is deprecated for Volta and later GPU architectures in favor
+   * of PC Sampling APIs from the header cupti_pcsampling.h.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO = 31,
+
+  /**
+   * SASS/Source line-by-line correlation record.
+   * This will generate sass/source correlation for functions that have source
+   * level analysis or pc sampling results. The records will be generated only
+   * when either of source level analysis or pc sampling activity is enabled.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionCorrelation.
+   * In CUDA 12.6, this kind is deprecated for Volta and later GPU architectures
+   * in favor of SASS Metric APIs from the header cupti_sass_metrics.h.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION = 32,
+
+  /**
+   * OpenACC data events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccData.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_DATA = 33,
+
+  /**
+   * OpenACC launch events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccLaunch.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH = 34,
+
+  /**
+   * OpenACC other events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccOther.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_OTHER = 35,
+
+  /**
+   * Information about a CUDA event (cudaEvent). This activity cannot be
+   * directly enabled or disabled. It is enabled and disabled through
+   * the activity CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityCudaEvent2.
+   */
+  CUPTI_ACTIVITY_KIND_CUDA_EVENT = 36,
+
+  /**
+   * Information about a CUDA stream. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityStream.
+   */
+  CUPTI_ACTIVITY_KIND_STREAM = 37,
+
+  /**
+   * Records for CUDA synchronization primitives. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySynchronization2.
+   */
+  CUPTI_ACTIVITY_KIND_SYNCHRONIZATION = 38,
+
+  /**
+   * Records for correlation of different programming APIs. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityExternalCorrelation.
+   */
+  CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION = 39,
+
+  /**
+   * NVLink topology information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityNvLink4.
+   */
+  CUPTI_ACTIVITY_KIND_NVLINK = 40,
+
+  /**
+   * Instantaneous Event information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEvent.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Event API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT = 41,
+
+  /**
+   * Instantaneous Event information for a specific event
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEventInstance.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Event API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE = 42,
+
+  /**
+   * Instantaneous Metric information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetric.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Metric API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC = 43,
+
+  /**
+   * Instantaneous Metric information for a specific metric
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetricInstance.
+   * This activity cannot be directly enabled or disabled.
+   * Information collected using the Metric API can be stored
+   * in the corresponding activity record.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE = 44,
+
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY = 45,
+
+  /**
+   * PCI devices information used for PCI topology.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityPcie.
+   */
+  CUPTI_ACTIVITY_KIND_PCIE = 46,
+
+  /**
+   * OpenMP parallel events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenMp.
+   */
+  CUPTI_ACTIVITY_KIND_OPENMP = 47,
+
+  /**
+   * A CUDA driver kernel launch occurring outside of any
+   * public API function execution. Tools can handle these
+   * like records for driver API launch functions, although
+   * the cbid field is not used here.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API = 48,
+
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY2 = 49,
+
+  /**
+   * Memory pool activity tracking creation, destruction and
+   * trimming of the memory pool.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemoryPool2.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY_POOL = 50,
+
+  /**
+   * Activity record for graph-level information.
+   * The corresponding activity record structure is
+   * \ref CUpti_ActivityGraphTrace2.
+   */
+  CUPTI_ACTIVITY_KIND_GRAPH_TRACE = 51,
+
+  /**
+   * JIT (Just-in-time) operation tracking.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityJit.
+   */
+  CUPTI_ACTIVITY_KIND_JIT = 52,
+
+  /**
+   * This activity cannot be directly enabled or disabled.
+   * It is enabled when CUPTI_ACTIVITY_KIND_GRAPH_TRACE is enabled
+   * and device graph trace is enabled through API cuptiActivityEnableDeviceGraph().
+   * The corresponding activity record structure is
+   * \ref CUpti_ActivityDeviceGraphTrace.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE_GRAPH_TRACE = 53,
+
+  /**
+   * Tracing batches of copies that are to be decompressed.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemDecompress.
+   */
+  CUPTI_ACTIVITY_KIND_MEM_DECOMPRESS = 54,
+
+
+
+  /**
+   * Count of supported activity kinds (implicitly one more than the last kind above).
+   */
+  CUPTI_ACTIVITY_KIND_COUNT,
+
+  CUPTI_ACTIVITY_KIND_FORCE_INT     = 0x7fffffff /* INT32_MAX; presumably forces an int-sized enum -- confirm */
+} CUpti_ActivityKind;
+
+/**
+ * \brief The kinds of activity objects.
+ * \see CUpti_ActivityObjectKindId
+ */
+typedef enum {
+  /**
+   * The object kind is not known.
+   */
+  CUPTI_ACTIVITY_OBJECT_UNKNOWN  = 0,
+
+  /**
+   * A process. Identified by pt.processId in CUpti_ActivityObjectKindId.
+   */
+  CUPTI_ACTIVITY_OBJECT_PROCESS  = 1,
+
+  /**
+   * A thread. Identified by pt.processId and pt.threadId in CUpti_ActivityObjectKindId.
+   */
+  CUPTI_ACTIVITY_OBJECT_THREAD   = 2,
+
+  /**
+   * A device. Identified by dcs.deviceId in CUpti_ActivityObjectKindId.
+   */
+  CUPTI_ACTIVITY_OBJECT_DEVICE   = 3,
+
+  /**
+   * A context. Identified by dcs.deviceId and dcs.contextId in CUpti_ActivityObjectKindId.
+   */
+  CUPTI_ACTIVITY_OBJECT_CONTEXT  = 4,
+
+  /**
+   * A stream. Identified by dcs.deviceId, dcs.contextId and dcs.streamId in CUpti_ActivityObjectKindId.
+   */
+  CUPTI_ACTIVITY_OBJECT_STREAM   = 5,
+
+  CUPTI_ACTIVITY_OBJECT_FORCE_INT = 0x7fffffff /* INT32_MAX; presumably forces an int-sized enum -- confirm */
+} CUpti_ActivityObjectKind;
+
+/**
+ * \brief Identifiers for object kinds as specified by
+ * CUpti_ActivityObjectKind. This is a union, so \c pt and \c dcs share storage.
+ * \see CUpti_ActivityObjectKind
+ */
+typedef union {
+  /**
+   * Used for process and thread objects. A process object requires
+   * the process ID; a thread object requires both the process ID and
+   * the thread ID.
+   */
+  struct {
+    uint32_t processId;
+    uint32_t threadId;
+  } pt;
+
+  /**
+   * Used for device, context and stream objects. A device object
+   * requires the device ID; a context object requires both the device
+   * and context ID; a stream object requires the device, context,
+   * and stream ID.
+   */
+  struct {
+    uint32_t deviceId;
+    uint32_t contextId;
+    uint32_t streamId;
+  } dcs;
+} CUpti_ActivityObjectKindId;
+
+/**
+ * \brief Additional data for the overhead kind CUPTI_ACTIVITY_OVERHEAD_COMMAND_BUFFER_FULL.
+ */
+typedef struct {
+  /**
+   * The remaining space in the command buffer. This field will always be zero
+   * when the command buffer is full, so it is not useful in such cases.
+   * NOTE(review): the unit of this value is not stated here -- confirm.
+   */
+  uint32_t commandBufferLength;
+  /**
+   * The channel ID of the command buffer.
+   * NOTE(review): presumably CUPTI_INVALID_CHANNEL_ID when no channel applies -- confirm.
+   */
+  uint32_t channelID;
+  /**
+   * The channel type of the command buffer.
+   * NOTE(review): the valid channel type values are not defined in this section -- confirm.
+   */
+  uint32_t channelType;
+} CUpti_ActivityOverheadCommandBufferFullData;
+
+/**
+ * \brief The kinds of activity overhead. Values from BUFFER_FLUSH onward are n << 16.
+ */
+typedef enum {
+  /**
+   * The overhead kind is not known.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_UNKNOWN               = 0,
+
+  /**
+   * Compiler overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER       = 1,
+
+  /**
+   * Activity buffer flush overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH               = 1<<16,
+
+  /**
+   * CUPTI instrumentation overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION            = 2<<16,
+
+  /**
+   * CUPTI resource creation and destruction overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE                   = 3<<16,
+
+  /**
+   * CUDA Runtime triggered module loading overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_RUNTIME_TRIGGERED_MODULE_LOADING = 4<<16,
+
+  /**
+   * Lazy function loading overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_LAZY_FUNCTION_LOADING            = 5<<16,
+
+  /**
+   * Overhead due to lack of command buffer space.
+   * Refer to CUpti_ActivityOverheadCommandBufferFullData for more details.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_COMMAND_BUFFER_FULL              = 6<<16,
+
+  /**
+   * Overhead due to activity buffer request.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_ACTIVITY_BUFFER_REQUEST          = 7<<16,
+
+  /**
+    * Overhead due to UVM activity initialization.
+    */
+   CUPTI_ACTIVITY_OVERHEAD_UVM_ACTIVITY_INIT                = 8<<16,
+
+  CUPTI_ACTIVITY_OVERHEAD_FORCE_INT             = 0x7fffffff /* INT32_MAX; presumably forces an int-sized enum -- confirm */
+} CUpti_ActivityOverheadKind;
+
+/**
+ * \brief The kind of a compute API.
+ */
+typedef enum {
+  /**
+   * The compute API is not known.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_UNKNOWN    = 0,
+
+  /**
+   * The compute API is CUDA.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA       = 1,
+
+  /**
+   * The compute API is CUDA running
+   * in an MPS (Multi-Process Service) environment.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS   = 2,
+
+  CUPTI_ACTIVITY_COMPUTE_API_FORCE_INT  = 0x7fffffff /* INT32_MAX; presumably forces an int-sized enum -- confirm */
+} CUpti_ActivityComputeApiKind;
+
+/**
+ * \brief Flags associated with activity records.
+ *
+ * Activity record flags. Flags can be combined by bitwise OR to associate
+ * multiple flags with an activity record. Each flag is specific to a certain
+ * activity kind, as noted below, so the same bit values are reused across kinds.
+ */
+typedef enum {
+  /**
+   * Indicates the activity record has no flags.
+   */
+  CUPTI_ACTIVITY_FLAG_NONE          = 0,
+
+  /**
+   * Indicates the activity represents a device that supports
+   * concurrent kernel execution. Valid for
+   * CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_CONCURRENT_KERNELS  = 1 << 0,
+
+  /**
+   * Distinguishes a CUdevice_attribute value from a CUpti_DeviceAttribute
+   * value (going by the flag name, set presumably means CUdevice_attribute --
+   * confirm). Valid for CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an asynchronous memcpy
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an instantaneous marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a region start marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_START  = 1 << 1,
+
+  /**
+   * Indicates the activity represents a region end marker. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_END  = 1 << 2,
+
+  /**
+   * Indicates the activity represents an attempt to acquire a user
+   * defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE = 1 << 3,
+
+  /**
+   * Indicates the activity represents success in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS = 1 << 4,
+
+  /**
+   * Indicates the activity represents failure in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED = 1 << 5,
+
+  /**
+   * Indicates the activity represents releasing a reservation on
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE = 1 << 6,
+
+  /**
+   * Indicates the activity represents a marker that does not specify
+   * a color. Valid for CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_NONE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a marker that specifies a color
+   * in alpha-red-green-blue format. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_ARGB  = 1 << 1,
+
+  /**
+   * Mask (bits 0-7) for the number of bytes requested by each thread.
+   * Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+
+  /**
+   * If this bit in the flag is set, the access was a load, else it is a
+   * store access. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_LOAD       = 1 << 8,
+
+  /**
+   * If this bit in the flag is set, the load access was cached, else it is
+   * uncached. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_CACHED     = 1 << 9,
+
+  /**
+   * If this bit in the flag is set, the metric value overflowed. Valid
+   * for CUpti_ActivityMetric and CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_OVERFLOWED     = 1 << 0,
+
+  /**
+   * If this bit in the flag is set, the metric value couldn't be
+   * calculated. This occurs when one or more values required to calculate
+   * the metric are missing.  Valid for CUpti_ActivityMetric and
+   * CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_VALUE_INVALID  = 1 << 1,
+
+  /**
+   * If this bit in the flag is set, the source level metric value couldn't be
+   * calculated. This occurs when one or more values required to calculate the
+   * source level metric cannot be evaluated.
+   * Valid for CUpti_ActivityInstructionExecution.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_VALUE_INVALID  = 1 << 0,
+
+  /**
+   * The mask (bits 1-8) for the instruction class, \ref CUpti_ActivityInstructionClass.
+   * Valid for CUpti_ActivityInstructionExecution and
+   * CUpti_ActivityInstructionCorrelation.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_CLASS_MASK    = 0xFF << 1,
+
+  /**
+   * When calling cuptiActivityFlushAll, this flag
+   * can be set to force CUPTI to flush all records in the buffer, whether
+   * finished or not.
+   */
+  CUPTI_ACTIVITY_FLAG_FLUSH_FORCED = 1 << 0,
+
+  /**
+   * Mask (bits 0-7) for the number of bytes requested by each thread.
+   * Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+
+  /**
+   * If this bit in the flag is set, the access was a load, else it is a
+   * store access.  Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_LOAD       = 1 << 8,
+
+  /**
+   * Indicates the activity represents an asynchronous memset
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents thrashing in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THRASHING_IN_CPU = 1 << 0,
+
+  /**
+   * Indicates the activity represents page throttling in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THROTTLING_IN_CPU = 1 << 0,
+
+  CUPTI_ACTIVITY_FLAG_FORCE_INT = 0x7fffffff /* INT32_MAX; presumably forces an int-sized enum -- confirm */
+} CUpti_ActivityFlag;
+
+/**
+ * \brief The stall reason for PC sampling activity.
+ */
+typedef enum {
+  /**
+   * Invalid stall reason.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID      = 0,
+
+  /**
+   * No stall; the instruction is selected for issue.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE         = 1,
+
+  /**
+   * Warp is blocked because the next instruction is not yet available,
+   * because of an instruction cache miss, or because of branching effects.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH   = 2,
+
+  /**
+   * Instruction is waiting on an arithmetic dependency.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY   = 3,
+
+  /**
+   * Warp is blocked because it is waiting for a memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY   = 4,
+
+  /**
+   * Texture sub-system is fully utilized or has too many outstanding requests.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE   = 5,
+
+  /**
+   * Warp is blocked as it is waiting at __syncthreads() or at a memory barrier.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC   = 6,
+
+  /**
+   * Warp is blocked waiting for __constant__ memory and immediate memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY   = 7,
+
+  /**
+   * Compute operation cannot be performed because the required resources
+   * are not available.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY   = 8,
+
+  /**
+   * Warp is blocked because there are too many pending memory operations.
+   * In the Kepler architecture it often indicates a high number of memory replays.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE   = 9,
+
+  /**
+   * Warp was ready to issue, but some other warp issued instead.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED   = 10,
+
+  /**
+   * Miscellaneous reasons.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER   = 11,
+
+  /**
+   * Sleeping.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING   = 12,
+
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityPCSamplingStallReason;
+
+/**
+ * \brief Sampling period for the PC sampling method.
+ *
+ * Set with \ref cuptiActivityConfigurePCSampling. Each level maps internally to a period in cycles that may differ between GPUs.
+ */
+typedef enum {
+  /**
+   * The PC sampling period is not set.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_INVALID = 0,
+
+  /**
+   * Minimum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN = 1,
+
+  /**
+   * Sampling period in the lower range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_LOW = 2,
+
+  /**
+   * Medium sampling period.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MID = 3,
+
+  /**
+   * Sampling period in the higher range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_HIGH = 4,
+
+  /**
+   * Maximum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MAX = 5,
+
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_FORCE_INT = 0x7fffffff
+} CUpti_ActivityPCSamplingPeriod;
+
+/**
+ * \brief The kind of a memory copy, indicating the source and
+ * destination targets of the copy.
+ *
+ * Each kind represents the source and destination targets of a memory
+ * copy. Targets are host, device, and device array; a peer-to-peer copy goes across different devices.
+ */
+typedef enum {
+  /**
+   * The memory copy kind is not known.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN = 0,
+
+  /**
+   * A host to device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOD    = 1,
+
+  /**
+   * A device to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOH    = 2,
+
+  /**
+   * A host to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOA    = 3,
+
+  /**
+   * A device array to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOH    = 4,
+
+  /**
+   * A device array to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOA    = 5,
+
+  /**
+   * A device array to device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOD    = 6,
+
+  /**
+   * A device to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOA    = 7,
+
+  /**
+   * A device to device memory copy on the same device.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOD    = 8,
+
+  /**
+   * A host to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOH    = 9,
+
+  /**
+   * A peer to peer memory copy across different devices.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_PTOP    = 10,
+
+  CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemcpyKind;
+
+/**
+ * \brief The kinds of memory accessed by a memory operation/copy.
+ *
+ * Each kind represents the type of the memory
+ * accessed by a memory operation/copy.
+ */
+typedef enum {
+  /**
+   * The memory kind is unknown.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN            = 0,
+
+  /**
+   * The memory is pageable.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE           = 1,
+
+  /**
+   * The memory is pinned.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PINNED             = 2,
+
+  /**
+   * The memory is on the device.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE             = 3,
+
+  /**
+   * The memory is an array.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_ARRAY              = 4,
+
+  /**
+   * The memory is managed.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED            = 5,
+
+  /**
+   * The memory is device static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC      = 6,
+
+  /**
+   * The memory is managed static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC     = 7,
+
+  CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT          = 0x7fffffff
+} CUpti_ActivityMemoryKind;
+
+/**
+ * \brief The kind of a preemption activity.
+ */
+typedef enum {
+  /**
+   * The preemption kind is not known.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_UNKNOWN    = 0,
+
+  /**
+   * Preemption to save a CDP block. NOTE(review): "CDP" is not expanded anywhere in this header; presumably CUDA Dynamic Parallelism - confirm.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_SAVE       = 1,
+
+  /**
+   * Preemption to restore a CDP block.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_RESTORE    = 2,
+
+  CUPTI_ACTIVITY_PREEMPTION_KIND_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityPreemptionKind;
+
+/**
+ * \brief The kind of environment data. Used to indicate what type of
+ * data is being reported by an environment activity record.
+ */
+typedef enum {
+  /**
+   * Unknown data.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_UNKNOWN = 0,
+
+  /**
+   * The environment data is related to speed.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_SPEED = 1,
+
+  /**
+   * The environment data is related to temperature.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE = 2,
+
+  /**
+   * The environment data is related to power.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_POWER = 3,
+
+  /**
+   * The environment data is related to cooling.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_COOLING = 4,
+
+  CUPTI_ACTIVITY_ENVIRONMENT_COUNT, /* One past the last explicit kind above, i.e. 5. */
+
+  CUPTI_ACTIVITY_ENVIRONMENT_KIND_FORCE_INT    = 0x7fffffff
+} CUpti_ActivityEnvironmentKind;
+
+/**
+ * \brief Reasons for clock throttling.
+ *
+ * The possible reasons that a clock can be throttled. There can be
+ * more than one reason that a clock is being throttled so these types
+ * can be combined by bitwise OR.  These are used in the
+ * clocksThrottleReason field in the Environment Activity Record.
+ */
+typedef enum {
+  /**
+   * Nothing is running on the GPU and the clocks are dropping to idle
+   * state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_GPU_IDLE              = 0x00000001,
+
+  /**
+   * The GPU clocks are limited by a user specified limit.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS   = 0x00000002,
+
+  /**
+   * A software power scaling algorithm is reducing the clocks below
+   * requested clocks.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_SW_POWER_CAP          = 0x00000004,
+
+  /**
+   * Hardware slowdown to reduce the clock by a factor of two or more
+   * is engaged.  This is an indicator of one of the following: 1)
+   * Temperature is too high, 2) External power brake assertion is
+   * being triggered (e.g. by the system power supply), 3) Change in
+   * power state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN           = 0x00000008,
+
+  /**
+   * Some unspecified factor is reducing the clocks. NOTE(review): 0x80000000 does not fit in a signed 32-bit int, unlike every other value of this enum.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNKNOWN               = 0x80000000,
+
+  /**
+   * Throttle reason is not supported for this GPU.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNSUPPORTED           = 0x40000000,
+
+  /**
+   * No clock throttling (no reason bit is set).
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_NONE                  = 0x00000000,
+
+  CUPTI_CLOCKS_THROTTLE_REASON_FORCE_INT             = 0x7fffffff
+} CUpti_EnvironmentClocksThrottleReason;
+
+/**
+ * \brief Scope of the unified memory counter (deprecated in CUDA 7.0).
+ */
+typedef enum {
+  /**
+   * The unified memory counter scope is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_UNKNOWN = 0,
+
+  /**
+   * Collect the unified memory counter for a single process on one device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE = 1,
+
+  /**
+   * Collect the unified memory counter for a single process across all devices.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_ALL_DEVICES = 2,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_COUNT, /* One past the last explicit scope above, i.e. 3. */
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterScope;
+
+/**
+ * \brief Kind of the Unified Memory counter.
+ *
+ * Many activities are associated with the Unified Memory mechanism; among them
+ * are transfers from host to device, device to host, and page faults on the
+ * host side.
+ */
+typedef enum {
+  /**
+   * The unified memory counter kind is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_UNKNOWN = 0,
+
+  /**
+   * Number of bytes transferred from host to device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD = 1,
+
+  /**
+   * Number of bytes transferred from device to host.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH = 2,
+
+  /**
+   * Number of CPU page faults. This is only supported on 64 bit
+   * Linux and Mac platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT = 3,
+
+  /**
+   * Number of GPU page faults. This is only supported on devices with
+   * compute capability 6.0 and higher and 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT = 4,
+
+  /**
+   * Thrashing occurs when data is frequently accessed by
+   * multiple processors and has to be constantly migrated around
+   * to achieve data locality. In this case the overhead of migration
+   * may exceed the benefits of locality.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING = 5,
+
+  /**
+   * Throttling is a prevention technique used by the driver to avoid
+   * further thrashing. Here, the driver doesn't service the fault for
+   * one of the contending processors for a specific period of time,
+   * so that the other processor can run at full-speed.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING = 6,
+
+  /**
+   * In case throttling does not help, the driver tries to pin the memory
+   * to a processor for a specific period of time. One of the contending
+   * processors will have slow access to the memory, while the other will
+   * have fast access.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP = 7,
+
+  /**
+   * Number of bytes transferred from one device to another device.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD = 8,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_COUNT, /* One past the last explicit kind above, i.e. 9. */
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterKind;
+
+/**
+ * \brief Memory access type for unified memory page faults.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+ * and \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT.
+ */
+typedef enum {
+  /**
+   * The unified memory access type is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_UNKNOWN = 0,
+
+  /**
+   * The page fault was triggered by a read memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_READ = 1,
+
+  /**
+   * The page fault was triggered by a write memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_WRITE = 2,
+
+  /**
+   * The page fault was triggered by an atomic memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_ATOMIC = 3,
+
+  /**
+   * The page fault was triggered by a memory prefetch operation.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_PREFETCH = 4
+} CUpti_ActivityUnifiedMemoryAccessType;
+
+/**
+ * \brief Migration cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+ * \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH.
+ */
+typedef enum {
+  /**
+   * The unified memory migration cause is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_UNKNOWN = 0,
+
+  /**
+   * The unified memory migrated due to an explicit call from
+   * the user, e.g. cudaMemPrefetchAsync.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_USER = 1,
+
+  /**
+   * The unified memory migrated to guarantee data coherence,
+   * e.g. CPU/GPU faults on Pascal+ and kernel launch on pre-Pascal GPUs.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_COHERENCE = 2,
+
+  /**
+   * The unified memory was speculatively migrated by the UVM driver
+   * before being accessed by the destination processor to improve
+   * performance.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_PREFETCH = 3,
+
+  /**
+   * The unified memory migrated to the CPU because it was evicted to make
+   * room for another block of memory on the GPU.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_EVICTION = 4,
+
+  /**
+   * The unified memory migrated to another processor because of access counter
+   * notifications. Only frequently accessed pages are migrated between CPU and GPU, or
+   * between peer GPUs.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_ACCESS_COUNTERS = 5
+} CUpti_ActivityUnifiedMemoryMigrationCause;
+
+/**
+ * \brief Remote memory map cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+ */
+typedef enum {
+  /**
+   * The cause of mapping to remote memory was unknown.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_UNKNOWN = 0,
+
+  /**
+   * Mapping to remote memory was added to maintain data coherence.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_COHERENCE = 1,
+
+  /**
+   * Mapping to remote memory was added to prevent further thrashing.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_THRASHING = 2,
+
+  /**
+   * Mapping to remote memory was added to enforce the hints
+   * specified by the programmer or by performance heuristics of the
+   * UVM driver.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_POLICY = 3,
+
+  /**
+   * Mapping to remote memory was added because there is no more
+   * memory available on the processor and eviction was not
+   * possible.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_OUT_OF_MEMORY = 4,
+
+  /**
+   * Mapping to remote memory was added after the memory was
+   * evicted to make room for another block of memory on the GPU.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_EVICTION = 5
+} CUpti_ActivityUnifiedMemoryRemoteMapCause;
+
+/**
+ * \brief SASS instruction classification.
+ *
+ * SASS instructions are broadly divided into different classes. Each enumerator represents one class.
+ */
+typedef enum {
+  /**
+   * The instruction class is not known.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNKNOWN = 0,
+
+  /**
+   * Represents a 32 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_32 = 1,
+
+  /**
+   * Represents a 64 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_64 = 2,
+
+  /**
+   * Represents an integer operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTEGER = 3,
+
+  /**
+   * Represents a bit conversion operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BIT_CONVERSION = 4,
+
+  /**
+   * Represents a control flow instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONTROL_FLOW = 5,
+
+  /**
+   * Represents a global load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL = 6,
+
+  /**
+   * Represents a shared load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED = 7,
+
+  /**
+   * Represents a local load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_LOCAL = 8,
+
+  /**
+   * Represents a generic load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GENERIC = 9,
+
+  /**
+   * Represents a surface load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE = 10,
+
+  /**
+   * Represents a constant load instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONSTANT = 11,
+
+  /**
+   * Represents a texture load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_TEXTURE = 12,
+
+  /**
+   * Represents a global atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL_ATOMIC = 13,
+
+  /**
+   * Represents a shared atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED_ATOMIC = 14,
+
+  /**
+   * Represents a surface atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE_ATOMIC = 15,
+
+  /**
+   * Represents an inter-thread communication instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTER_THREAD_COMMUNICATION = 16,
+
+  /**
+   * Represents a barrier instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BARRIER = 17,
+
+  /**
+   * Represents miscellaneous instructions which do not fit in the above classification.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_MISCELLANEOUS = 18,
+
+  /**
+   * Represents a 16 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_16 = 19,
+
+  /**
+   * Represents a uniform instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNIFORM = 20,
+
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_KIND_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityInstructionClass;
+
+/**
+ * \brief Partitioned global caching option.
+ */
+typedef enum {
+  /**
+   * Partitioned global cache config is unknown.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN       = 0,
+
+  /**
+   * Partitioned global cache is not supported.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED = 1,
+
+  /**
+   * Partitioned global cache config is off.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF           = 2,
+
+  /**
+   * Partitioned global cache config is on.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON            = 3,
+
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityPartitionedGlobalCacheConfig;
+
+/**
+ * \brief Synchronization type.
+ *
+ * The types of synchronization to be used with
+ * CUpti_ActivitySynchronization2.
+ */
+
+typedef enum {
+  /**
+   * Unknown synchronization type.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_UNKNOWN             = 0,
+
+  /**
+   * Event synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE   = 1,
+
+  /**
+   * Stream wait event API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT   = 2,
+
+  /**
+   * Stream synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE  = 3,
+
+  /**
+   * Context synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE = 4,
+
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_FORCE_INT           = 0x7fffffff
+} CUpti_ActivitySynchronizationType;
+
+/**
+ * \brief Stream type.
+ *
+ * The types of stream to be used with CUpti_ActivityStream.
+ */
+
+typedef enum {
+  /**
+   * Unknown stream type.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_UNKNOWN      = 0,
+
+  /**
+   * Default stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_DEFAULT      = 1,
+
+  /**
+   * Non-blocking stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NON_BLOCKING = 2,
+
+  /**
+   * Null stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NULL         = 3,
+
+  /**
+   * Stream create mask (0xFFFF, i.e. the low 16 bits).
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_MASK              = 0xFFFF,
+
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_FORCE_INT    = 0x7fffffff
+} CUpti_ActivityStreamFlag;
+
+/**
+ * \brief Link flags.
+ *
+ * Describes link properties, to be used with CUpti_ActivityNvLink.
+ */
+
+typedef enum {
+  /**
+   * The flag is invalid.
+   */
+  CUPTI_LINK_FLAG_INVALID        = 0,
+
+  /**
+   * Set if peer to peer access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_PEER_ACCESS    = (1 << 1),
+
+  /**
+   * Set if system memory access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ACCESS  = (1 << 2),
+
+  /**
+   * Set if peer atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_PEER_ATOMICS   = (1 << 3),
+
+  /**
+   * Set if system memory atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ATOMICS = (1 << 4),
+
+  CUPTI_LINK_FLAG_FORCE_INT = 0x7fffffff
+} CUpti_LinkFlag;
+
+/**
+ * \brief Memory operation types.
+ *
+ * Describes the type of memory operation, to be used with CUpti_ActivityMemory4.
+ */
+
+typedef enum {
+  /**
+   * The operation is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_INVALID   = 0,
+
+  /**
+   * Memory is allocated.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION = 1,
+
+  /**
+   * Memory is released.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE    = 2,
+
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityMemoryOperationType;
+
+/**
+ * \brief Memory pool types.
+ *
+ * Describes the type of memory pool, to be used with CUpti_ActivityMemory4.
+ */
+
+typedef enum {
+  /**
+   * The memory pool type is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_INVALID   = 0,
+
+  /**
+   * Memory pool is local to the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL     = 1,
+
+  /**
+   * Memory pool is imported by the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED  = 2,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolType;
+
+/**
+ * \brief Memory pool operation types.
+ *
+ * Describes the type of memory pool operation, to be used with CUpti_ActivityMemoryPool2.
+ */
+
+typedef enum {
+  /**
+   * The operation is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_INVALID   = 0,
+
+  /**
+   * Memory pool is created.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_CREATED   = 1,
+
+  /**
+   * Memory pool is destroyed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_DESTROYED = 2,
+
+  /**
+   * Memory pool is trimmed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED   = 3,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolOperationType;
+
+/** \brief The type of a HW channel. */
+typedef enum {
+  CUPTI_CHANNEL_TYPE_INVALID      = 0,  /**< The channel type is invalid. */
+
+  /**
+   * Channel is used for standard work launch and tracking.
+   */
+  CUPTI_CHANNEL_TYPE_COMPUTE      = 1,
+
+  /**
+   * Channel is used by an asynchronous copy engine.
+   * For confidential compute configurations, work launch and
+   * completion are done using the copy engines.
+   */
+  CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY = 2,
+
+  /**
+   * Channel is used for memory decompression operations.
+   */
+  CUPTI_CHANNEL_TYPE_DECOMP       = 3,
+
+  CUPTI_CHANNEL_TYPE_FORCE_INT    = 0x7fffffff
+} CUpti_ChannelType;
+
+/**
+ * \brief CIG (CUDA in Graphics) modes.
+ *
+ * Describes the CIG modes associated with the CUDA context.
+ */
+
+typedef enum
+{
+  /**
+   * Regular (non-CIG) mode.
+   */
+  CUPTI_CONTEXT_CIG_MODE_NONE         = 0,
+  /**
+   * CIG mode.
+   */
+  CUPTI_CONTEXT_CIG_MODE_CIG          = 1,
+  /**
+   * CIG fallback mode.
+   */
+  CUPTI_CONTEXT_CIG_MODE_CIG_FALLBACK = 2,
+
+  CUPTI_CONTEXT_CIG_MODE_FORCE_INT    = 0x7fffffff
+} CUpti_ContextCigMode;
+
+/**
+ * The source-locator ID that indicates an unknown source
+ * location. There is not an actual CUpti_ActivitySourceLocator object
+ * corresponding to this value.
+ */
+#define CUPTI_SOURCE_LOCATOR_ID_UNKNOWN 0
+
+/**
+ * An invalid function index ID.
+ */
+#define CUPTI_FUNCTION_INDEX_ID_INVALID 0
+
+/**
+ * An invalid/unknown correlation ID. A correlation ID of this value
+ * indicates that there is no correlation for the activity record.
+ */
+#define CUPTI_CORRELATION_ID_UNKNOWN 0
+
+/**
+ * An invalid/unknown grid ID.
+ */
+#define CUPTI_GRID_ID_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown timestamp for a start, end, queued, submitted,
+ * or completed time.
+ */
+#define CUPTI_TIMESTAMP_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown synchronization value.
+ */
+#define CUPTI_SYNCHRONIZATION_INVALID_VALUE ((uint32_t) 0xFFFFFFFFU)
+
+/**
+ * An invalid/unknown process ID.
+ */
+#define CUPTI_AUTO_BOOST_INVALID_CLIENT_PID 0
+
+/**
+ * Invalid/unknown NVLink port number (parenthesized so the sign stays bound to the literal in any expansion context).
+ */
+#define CUPTI_NVLINK_INVALID_PORT (-1)
+
+/**
+ * Maximum number of NVLink ports.
+ */
+#define CUPTI_MAX_NVLINK_PORTS 32
+
+/**
+ * An invalid/unknown value for decompressed bytes.
+ */
+#define CUPTI_DECOMPRESSED_BYTES_UNKNOWN 0LL
+
+START_PACKED_ALIGNMENT
+/**
+ * \brief Unified Memory counters configuration structure
+ *
+ * This structure controls the enabling/disabling of the various
+ * Unified Memory counters, consisting of scope, kind and other parameters.
+ * See function \ref cuptiActivityConfigureUnifiedMemoryCounter
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Unified Memory counter scope. (deprecated in CUDA 7.0)
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * Unified Memory counter kind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind kind;
+
+  /**
+   * Device ID of the target device. This is relevant only
+   * for single device scopes. (deprecated in CUDA 7.0)
+   */
+  uint32_t deviceId;
+
+  /**
+   * Control to enable/disable the counter. Set it to a non-zero value
+   * to enable the counter; zero disables it.
+   */
+  uint32_t enable;
+} CUpti_ActivityUnifiedMemoryCounterConfig;
+
+/**
+ * \brief Device auto boost state structure
+ *
+ * This structure defines the auto boost state for a device.
+ * See function \ref cuptiGetAutoBoostState
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Returned auto boost state. 1 is returned if auto boost is enabled, 0
+   * otherwise.
+   */
+  uint32_t enabled;
+
+  /**
+   * ID of the process that has set the current boost state. The value will be
+   * CUPTI_AUTO_BOOST_INVALID_CLIENT_PID if the user does not have the
+   * permission to query process IDs or there is an error in querying the
+   * process ID.
+   */
+  uint32_t pid;
+
+} CUpti_ActivityAutoBoostState;
+
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure defines the PC sampling configuration.
+ *
+ * See function \ref cuptiActivityConfigurePCSampling
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Size of the configuration structure.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  uint32_t size;
+
+  /**
+   * There are 5 levels provided for the sampling period. Each level
+   * internally maps to a period in terms of cycles. The same level can
+   * map to a different number of cycles on different GPUs. The number of
+   * cycles will be chosen to minimize information loss. The period
+   * chosen will be given by samplingPeriodInCycles in
+   * \ref CUpti_ActivityPCSamplingRecordInfo for each kernel instance.
+   */
+  CUpti_ActivityPCSamplingPeriod samplingPeriod;
+
+  /**
+   * This overrides the period set by samplingPeriod. A value of 0 means
+   * samplingPeriod2 is not used and samplingPeriod is used instead.
+   * Valid values for samplingPeriod2 are between 5 and 31, both inclusive.
+   * This will set the sampling period to (2^samplingPeriod2) cycles.
+   */
+  uint32_t samplingPeriod2;
+} CUpti_ActivityPCSamplingConfig;
+
+/**
+ * \brief The base activity record.
+ *
+ * The activity API uses a CUpti_Activity as a generic representation
+ * for any activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_Activity object can
+ * be cast to the specific activity record type appropriate for that kind.
+ *
+ * Note that all activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity; selects the specific record type this object can be cast to.
+   */
+  CUpti_ActivityKind kind;
+} CUpti_Activity;
+
+/**
+ * \brief The activity record for memory copies.
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use (only present when CUPTILP64 is defined).
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t pad2;
+
+  /**
+   * The total number of memcpy operations traced in this record.
+   * This field is valid for memcpy operations happening using
+   * MemcpyBatchAsync APIs in CUDA.
+   * In MemcpyBatchAsync APIs, multiple memcpy operations are batched
+   * together for optimization purposes based on certain heuristics.
+   * For other memcpy operations, this field will be 1.
+   */
+   uint64_t copyCount;
+} CUpti_ActivityMemcpy6;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityMemcpyPtoP4;
+
+/**
+ * \brief The activity record for memset.
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory set is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad2;
+} CUpti_ActivityMemset4;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY).
+ * This activity record provides a single record for the memory
+ * allocation and memory release operations.
+ *
+ * Note: It is recommended to move to the new activity record \ref CUpti_ActivityMemory4
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY2.
+ * \ref CUpti_ActivityMemory4 provides separate records for memory
+ * allocation and memory release operations. This makes it possible to correlate the
+ * corresponding driver and runtime API activity record with the memory operation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory kind requested by the user. \see CUpti_ActivityMemoryKind
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, i.e.
+   * the time when memory was allocated, in ns.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory operation, i.e.
+   * the time when memory was freed, in ns.
+   * This will be 0 if memory is not freed in the application.
+   */
+  uint64_t end;
+
+  /**
+   * The program counter of the allocation of memory.
+   */
+  uint64_t allocPC;
+
+  /**
+   * The program counter of the freeing of memory. This will
+   * be 0 if memory is not freed in the application.
+   */
+  uint64_t freePC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory allocation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+} CUpti_ActivityMemory;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This makes it possible to correlate the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If the memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if the memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct PACKED_ALIGNMENT {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use.
+     * This field is compiled in only when CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    /**
+     * A union holding either the size of the memory pool, in bytes, or the
+     * processId of the memory pool; which member is valid depends on
+     * \p memoryPoolType.
+     * \p size is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     * \p processId is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+     */
+    union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+
+    /**
+     * The utilized size of the memory pool. \p utilizedSize is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t utilizedSize;
+  } memoryPoolConfig;
+
+    /**
+     * The shared object or binary that the memory allocation request comes from.
+     * This is a member of the outer record, not of \p memoryPoolConfig.
+     */
+    const char* source;
+} CUpti_ActivityMemory4;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This makes it possible to correlate the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory pool operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The utilized size of the memory pool. \p utilizedSize is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t utilizedSize;
+} CUpti_ActivityMemoryPool2;
+
+/**
+ * \brief The type of the CUDA kernel launch.
+ */
+typedef enum {
+  /**
+   * The kernel was launched via a regular kernel call.
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR = 0,
+
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernel() or
+   * \ref cuLaunchCooperativeKernel().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE = 1,
+
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernelMultiDevice() or
+   * \ref cuLaunchCooperativeKernelMultiDevice().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE = 2,
+
+  /**
+   * The kernel was launched as a CBL commandlist.
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_CBL_COMMANDLIST = 3,
+} CUpti_ActivityLaunchType;
+
+/**
+ * \brief The shared memory limit per block config for a kernel.
+ * This should be used to set the 'cudaOccFuncShmemConfig' field in the occupancy calculator API.
+ */
+typedef enum  {
+    /**
+     * The shared memory limit config is default.
+     */
+    CUPTI_FUNC_SHMEM_LIMIT_DEFAULT              = 0x00,
+
+    /**
+     * The user has opted for a higher dynamic shared memory limit using the function attribute
+     * 'cudaFuncAttributeMaxDynamicSharedMemorySize' for the runtime API or
+     * CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES for the driver API.
+     */
+    CUPTI_FUNC_SHMEM_LIMIT_OPTIN                = 0x01,
+
+    CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT            = 0x7fffffff
+} CUpti_FuncShmemLimitConfig;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The requested and executed cache configurations, packed into one byte:
+   * \p both accesses the whole byte, \p config accesses the two 4-bit fields.
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates whether the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+  /**
+   * Shared memory size set by the driver.
+   */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has opted for a
+   * higher per block limit of dynamic shared memory. \see CUpti_FuncShmemLimitConfig
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint64_t localMemoryTotal_v2;
+
+  /**
+   * The maximum cluster size for the kernel.
+   */
+  uint32_t maxPotentialClusterSize;
+
+  /**
+   * The maximum clusters that could co-exist on the target device for the kernel.
+   */
+  uint32_t maxActiveClusters;
+} CUpti_ActivityKernel9;
+
+/**
+ * \brief The activity record for CDP (CUDA Dynamic Parallelism)
+ * kernel.
+ *
+ * This activity record represents a CDP kernel execution.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CDP_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel execution
+   * is assigned a unique grid ID.
+   */
+  int64_t gridId;
+
+  /**
+   * The grid ID of the parent kernel.
+   */
+  int64_t parentGridId;
+
+  /**
+   * The timestamp when the kernel is queued up, in ns. A value of
+   * CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time is
+   * unknown.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the kernel is submitted to the GPU, in ns. A value
+   * of CUPTI_TIMESTAMP_UNKNOWN indicates that the submission time is
+   * unknown.
+   */
+  uint64_t submitted;
+
+  /**
+   * The timestamp when the kernel is marked as completed, in ns. A value
+   * of CUPTI_TIMESTAMP_UNKNOWN indicates that the completion time is
+   * unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The X-dimension of the parent block.
+   */
+  uint32_t parentBlockX;
+
+  /**
+   * The Y-dimension of the parent block.
+   */
+  uint32_t parentBlockY;
+
+  /**
+   * The Z-dimension of the parent block.
+   */
+  uint32_t parentBlockZ;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityCdpKernel;
+
+/**
+ * \brief The activity record for a preemption of a CDP kernel.
+ *
+ * This activity record represents a preemption of a CDP kernel.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PREEMPTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the preemption. \see CUpti_ActivityPreemptionKind
+   */
+  CUpti_ActivityPreemptionKind preemptionKind;
+
+  /**
+   * The timestamp of the preemption, in ns. A value of 0 indicates
+   * that timestamp information could not be collected for the
+   * preemption.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The grid ID of the block that is preempted.
+   */
+  int64_t gridId;
+
+  /**
+   * The X-dimension of the block that is preempted.
+   */
+  uint32_t blockX;
+
+  /**
+   * The Y-dimension of the block that is preempted.
+   */
+  uint32_t blockY;
+
+  /**
+   * The Z-dimension of the block that is preempted.
+   */
+  uint32_t blockZ;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityPreemption;
+
+/**
+ * \brief The activity record for a driver or runtime API invocation.
+ *
+ * This activity record represents an invocation of a driver or
+ * runtime API (CUPTI_ACTIVITY_KIND_DRIVER and
+ * CUPTI_ACTIVITY_KIND_RUNTIME).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DRIVER,
+   * CUPTI_ACTIVITY_KIND_RUNTIME, or CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the driver or runtime function.
+   */
+  CUpti_CallbackId cbid;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the driver or runtime CUDA function
+   * is executing.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the thread where the driver or runtime CUDA function is
+   * executing.
+   */
+  uint32_t threadId;
+
+  /**
+   * The correlation ID of the driver or runtime CUDA function. Each
+   * function invocation is assigned a unique correlation ID that is
+   * identical to the correlation ID in the memcpy, memset, or kernel
+   * activity record that is associated with this function.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The return value for the function. For a CUDA driver function
+   * this will be a CUresult value, and for a CUDA runtime function
+   * this will be a cudaError_t value.
+   */
+  uint32_t returnValue;
+} CUpti_ActivityAPI;
+
+/**
+ * \brief The activity record for a CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_EVENT). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profile frameworks built on top of CUPTI that collect
+ * event data may choose to use this type to store the collected event
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The collected event value.
+   */
+  uint64_t value;
+
+  /**
+   * The ID of the event domain.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityEvent;
+
+/**
+ * \brief The activity record for a CUPTI event with instance
+ * information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE). This activity record kind is
+ * not produced by the activity API but is included for completeness
+ * and ease-of-use. Profile frameworks built on top of CUPTI that
+ * collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The ID of the event domain.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The event domain instance that this value applies to.
+   */
+  uint32_t instance;
+
+  /**
+   * The event value for this event domain instance.
+   */
+  uint64_t value;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityEventInstance;
+
+/**
+ * \brief The activity record for a CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_METRIC). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profile frameworks built on top of CUPTI that collect
+ * metric data may choose to use this type to store the collected metric
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The collected metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityMetric;
+
+/**
+ * \brief The activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE).  This activity record kind
+ * is not produced by the activity API but is included for
+ * completeness and ease-of-use. Profile frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value for this metric domain instance.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The metric domain instance that this value applies to.
+   */
+  uint32_t instance;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityMetricInstance;
+
+/**
+ * \brief The activity record for source locator.
+ *
+ * This activity record represents a source locator
+ * (CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for the source path; it is used in all the source-level
+   * results.
+   */
+  uint32_t id;
+
+  /**
+   * The line number in the source.
+   */
+  uint32_t lineNumber;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The path for the file.
+   */
+  const char *fileName;
+} CUpti_ActivitySourceLocator;
+
+/**
+ * \brief The activity record for source-level global
+ * access.
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint64_t pcOffset;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of
+   * threads that executed this instruction with predicate and condition code
+   * evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this
+   * access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+} CUpti_ActivityGlobalAccess3;
+
+/**
+ * \brief The activity record for source level result
+ * branch.
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number
+   * of threads that executed this instruction.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityBranch2;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set device visibility via the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6];
+
+  /**
+   * GPU Instance ID for MIG-enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute Instance ID for MIG-enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+  /**
+   * NUMA (Non-uniform memory access) information for the device:
+   * whether the GPU is a NUMA node or not.
+   */
+  uint32_t isNumaNode;
+
+  /**
+   * NUMA (Non-uniform memory access) information for the device:
+   * the NUMA node ID of the GPU memory.
+   * If the GPU is not a NUMA node, it returns invalidNumaId.
+   */
+  uint32_t numaId;
+} CUpti_ActivityDevice5;
+
+/**
+ * \brief The activity record for a device attribute.
+ *
+ * This activity record represents information about a GPU device:
+ * either a CUpti_DeviceAttribute or CUdevice_attribute value
+ * (CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID of the device that this attribute applies to.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The attribute, either a CUpti_DeviceAttribute or
+   * CUdevice_attribute. Flag
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is used to indicate
+   * what kind of attribute this is. If
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is 1 then
+   * CUdevice_attribute field is valid, otherwise
+   * CUpti_DeviceAttribute field is valid.
+   */
+  union {
+    CUdevice_attribute cu;
+    CUpti_DeviceAttribute cupti;
+  } attribute;
+
+  /**
+   * The value for the attribute. See CUpti_DeviceAttribute and
+   * CUdevice_attribute for the type of the value for a given
+   * attribute.
+   */
+  union {
+    double vDouble;
+    uint32_t vUint32;
+    uint64_t vUint64;
+    int32_t vInt32;
+    int64_t vInt64;
+  } value;
+} CUpti_ActivityDeviceAttribute;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+
+  /**
+   * The ID of the parent context. It is 0 if the
+   * context does not have a parent.
+   */
+  uint32_t parentContextId;
+
+  /**
+   * This field indicates whether the context is a green context.
+   */
+  uint8_t isGreenContext;
+
+  uint8_t padding;
+
+  /**
+   * Number of multiprocessors assigned to the green context.
+   * Invalid if the field 'isGreenContext' is 0.
+   */
+  uint16_t numMultiprocessors;
+
+  /**
+   * This field indicates the CIG mode.
+   */
+  CUpti_ContextCigMode cigMode;
+
+  uint32_t padding2;
+
+} CUpti_ActivityContext3;
+
+/**
+ * \brief The activity record providing a name.
+ *
+ * This activity record provides a name for a device, context, thread,
+ * etc. and other resource naming done via NVTX APIs
+ * (CUPTI_ACTIVITY_KIND_NAME).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NAME.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of activity object being named.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name given to the activity object identified by 'objectId'.
+   */
+  const char *name;
+
+} CUpti_ActivityName;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time.
+ *
+ * The marker is specified with a descriptive name and unique id
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. 'objectKind' indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+  /**
+   * The name of the domain to which this marker belongs.
+   * This will be NULL for the default domain.
+   */
+  const char *domain;
+
+} CUpti_ActivityMarker2;
+
+/**
+ * \brief The activity record providing detailed information for a marker.
+ *
+ * User must enable CUPTI_ACTIVITY_KIND_MARKER as well
+ * to get records for marker data.
+ * The marker data contains color, payload, and category.
+ * (CUPTI_ACTIVITY_KIND_MARKER_DATA).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * Defines the payload format for the value associated with the marker.
+   */
+  CUpti_MetricValueKind payloadKind;
+
+  /**
+   * The payload value, in the format given by 'payloadKind'.
+   */
+  CUpti_MetricValue payload;
+
+  /**
+   * The color for the marker.
+   */
+  uint32_t color;
+
+  /**
+   * The category for the marker.
+   */
+  uint32_t category;
+
+} CUpti_ActivityMarkerData;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead, CUPTI, DRIVER, COMPILER etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which the
+   * records belong. This ID is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the overhead operation.
+   * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+
+  /**
+   * Pointer to the struct with additional details about the overhead.
+   * Refer to the CUpti_ActivityOverheadKind enum and the corresponding structure to typecast and access additional overhead data.
+   * The client is responsible for freeing this memory using the free function when done.
+   */
+  void *overheadData;
+
+} CUpti_ActivityOverhead3;
+
+/**
+ * \brief The activity record for CUPTI environmental data.
+ *
+ * This activity record provides CUPTI environmental data, including
+ * power, clocks, and thermals.  This information is sampled at
+ * various rates and returned in this activity record.  The consumer
+ * of the record needs to check the environmentKind field to figure
+ * out what kind of environmental record this is.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_ENVIRONMENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected for
+   * the sample.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The kind of data reported in this record.
+   */
+  CUpti_ActivityEnvironmentKind environmentKind;
+
+  union {
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_SPEED environment
+     * kind.
+     */
+    struct {
+      /**
+       * The SM frequency in MHz.
+       */
+      uint32_t smClock;
+
+      /**
+       * The memory frequency in MHz.
+       */
+      uint32_t memoryClock;
+
+      /**
+       * The PCIe link generation.
+       */
+      uint32_t pcieLinkGen;
+
+      /**
+       * The PCIe link width.
+       */
+      uint32_t pcieLinkWidth;
+
+      /**
+       * The clocks throttle reasons.
+       */
+      CUpti_EnvironmentClocksThrottleReason clocksThrottleReasons;
+    } speed;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE
+     * environment kind.
+     */
+    struct {
+      /**
+       * The GPU temperature in degrees C.
+       */
+      uint32_t gpuTemperature;
+    } temperature;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_POWER environment kind.
+     * power: the power in milliwatts consumed by GPU and associated circuitry.
+     * powerLimit: the power in milliwatts that will trigger power management algorithm.
+     */
+    struct {
+
+      uint32_t power;
+      uint32_t powerLimit;
+    } power;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_COOLING
+     * environment kind.
+     */
+    struct {
+      /**
+       * The fan speed as percentage of maximum.
+       */
+      uint32_t fanSpeed;
+    } cooling;
+  } data;
+} CUpti_ActivityEnvironment;
+
+/**
+ * \brief The activity record for source-level instruction execution.
+ *
+ * This activity records result for source level instruction execution.
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction execution.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number
+   * of threads that executed this instruction, regardless of predicate or condition code.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number
+   * of threads that executed this instruction with predicate and condition code evaluating to true.
+   */
+  uint64_t notPredOffThreadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionExecution;
+
+/**
+ * \brief The activity record for PC sampling.
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * Field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint64_t pcOffset;
+} CUpti_ActivityPCSampling3;
+
+/**
+ * \brief The activity record for record status for PC sampling.
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Number of times the PC was sampled for this kernel instance, including all
+   * dropped samples.
+   */
+  uint64_t totalSamples;
+
+  /**
+   * Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * Sampling period in terms of number of cycles.
+   */
+  uint64_t samplingPeriodInCycles;
+} CUpti_ActivityPCSamplingRecordInfo;
+
+/**
+ * \brief The activity record for Unified Memory counters (CUDA 7.0 and beyond)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Value of the counter
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the
+   * memory region in bytes.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it
+   * is the number of page fault groups for the same page.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT,
+   * it is the program counter for the instruction that caused fault.
+   */
+  uint64_t value;
+
+  /**
+   * The start timestamp of the counter, in ns.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity starts on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is
+   * captured when CUDA driver started processing the fault.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp
+   * is captured when CUDA driver detected thrashing of memory region.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING,
+   * timestamp is captured when throttling operation was started by CUDA driver.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP,
+   * timestamp is captured when CUDA driver has pushed all required operations
+   * to the processor specified by dstId.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp of the counter, in ns.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity finishes on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is
+   * captured when CUDA driver queues the replay of faulting memory accesses on the GPU
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp
+   * is captured when throttling operation was finished by CUDA driver
+   */
+  uint64_t end;
+
+  /**
+   * This is the virtual base address of the page/s being transferred. For cpu and
+   * gpu faults, the virtual address for the page that faulted.
+   */
+  uint64_t address;
+
+  /**
+   * The ID of the source CPU/device involved in the memory transfer, page fault, thrashing,
+   * throttling or remote map operation. For counterKind
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region, ONLY if there are less than 32 devices. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT
+   */
+  uint32_t srcId;
+
+  /**
+   * The ID of the destination CPU/device involved in the memory transfer or remote map
+   * operation. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING
+   */
+  uint32_t dstId;
+
+  /**
+   * The ID of the stream causing the transfer.
+   * The value of this field is invalid.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+   * and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH
+   * and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING
+   */
+  uint32_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * \brief The bitmask of devices involved in the operation.
+   *
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region. processors[0] represents the device ID of the device 0 to device 63,
+   * processors[1] represents device ID of device 64 to device 127 and so on.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_DTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_FAULT_REPLAY
+   */
+  uint64_t processors[5];
+} CUpti_ActivityUnifiedMemoryCounter3;
+
+/**
+ * \brief The activity record for global/device functions.
+ *
+ * This activity record holds a function name and the corresponding
+ * module information
+ * (CUPTI_ACTIVITY_KIND_FUNCTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_FUNCTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * ID to uniquely identify the record.
+   */
+  uint32_t id;
+
+  /**
+   * The ID of the context where the function is launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID in which this global/device function is present.
+   */
+  uint32_t moduleId;
+
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the function. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityFunction;
+
+/**
+ * \brief The activity record for a CUDA module.
+ *
+ * This activity record represents a CUDA module
+ * (CUPTI_ACTIVITY_KIND_MODULE). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease of use. Profiling frameworks built on top of CUPTI that collect
+ * module data from the module callback may choose to use this type to
+ * store the collected module data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MODULE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the context where the module is loaded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID.
+   */
+  uint32_t id;
+
+  /**
+   * The cubin size.
+   */
+  uint32_t cubinSize;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The pointer to the cubin.
+   */
+  const void *cubin;
+} CUpti_ActivityModule;
+
+/**
+ * \brief The activity record for source-level shared
+ * access.
+ *
+ * This activity record holds the locations of the shared
+ * accesses in the source
+ * (CUPTI_ACTIVITY_KIND_SHARED_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SHARED_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this shared access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of shared memory transactions generated by this access.
+   */
+  uint64_t sharedTransactions;
+
+  /**
+   * The minimum number of shared memory transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalSharedTransactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivitySharedAccess;
+
+/**
+ * \brief The activity record for a CUDA event.
+ *
+ * This activity record is used to track recorded events
+ * (CUPTI_ACTIVITY_KIND_CUDA_EVENT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context where the event was recorded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream where the event was recorded.
+   */
+  uint32_t streamId;
+
+  /**
+   * A unique event ID to identify the event record.
+   */
+  uint32_t eventId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The ID of the device where the event was recorded.
+   */
+  uint32_t deviceId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad2;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The device-side timestamp of the CUDA event record.
+   * The timestamp is in nanoseconds.
+   */
+  uint64_t deviceTimestamp;
+  /**
+   * A unique ID to associate event synchronization records
+   * with the latest CUDA Event record. A field of the same name
+   * exists in CUpti_ActivitySynchronization2 to associate the CUDA
+   * Event record with the synchronization record.
+   *
+   * The same CUDA event can be used multiple times, so the
+   * event ID alone is not unique enough to correlate the synchronization
+   * record with the latest CUDA Event record.
+   * This field is unique and can be used to do the required
+   * correlation.
+   */
+  uint64_t cudaEventSyncId;
+} CUpti_ActivityCudaEvent2;
+
+/**
+ * \brief The activity record for a CUDA stream.
+ *
+ * This activity record is used to track created streams
+ * (CUPTI_ACTIVITY_KIND_STREAM).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_STREAM.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * The ID of the context where the stream was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * A unique stream ID to identify the stream.
+   */
+  uint32_t streamId;
+
+  /**
+   * The clamped priority for the stream.
+   */
+  uint32_t priority;
+
+  /**
+   * Flags associated with the stream (\see CUpti_ActivityStreamFlag).
+   */
+  CUpti_ActivityStreamFlag flag;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityStream;
+
+/**
+ * \brief The activity record for synchronization management.
+ *
+ * This activity record is used to track various CUDA synchronization APIs
+ * (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The type of record (\see CUpti_ActivitySynchronizationType).
+   */
+  CUpti_ActivitySynchronizationType type;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context for which the synchronization API is called.
+   * For a context synchronization API, it is the ID of the context for which the API is called.
+   * For stream/event synchronization, it is the ID of the context where the stream/event was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuEventSynchronize.
+   */
+  uint32_t streamId;
+
+  /**
+   * The event ID for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuStreamSynchronize.
+   */
+  uint32_t cudaEventId;
+
+  /**
+   * A unique ID to associate event synchronization records
+   * with the latest CUDA Event record. A field of the same name
+   * exists in CUpti_ActivityCudaEvent2 to associate the synchronization
+   * record with the CUDA Event record.
+   *
+   * The same CUDA event can be used multiple times, so the
+   * event ID alone is not unique enough to correlate the synchronization
+   * record with the latest CUDA Event record.
+   * This field is unique and can be used to do the required
+   * correlation.
+   *
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates that
+   * the field is not applicable for this record.
+   * Valid only for synchronization records related to CUDA Events.
+   */
+  uint64_t cudaEventSyncId;
+
+  /**
+   * The return value for the synchronization record.
+   * Use the cuptiActivityEnableAllSyncRecords API to enable/disable
+   * collection of synchronization records whose return value is
+   * non-zero. This will be a CUresult value.
+   */
+  uint32_t returnValue;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivitySynchronization2;
+
+/**
+ * \brief The activity record for source-level sass/source
+ * line-by-line correlation.
+ *
+ * This activity record holds source-level sass/source correlation
+ * information
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+ /**
+  * Correlation ID with the global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionCorrelation;
+
+/**
+ * \brief The OpenACC event kind for OpenACC activity records (stored in their eventKind field).
+ *
+ * \see CUpti_ActivityKindOpenAcc
+ */
+typedef enum {
+  CUPTI_OPENACC_EVENT_KIND_INVALID              = 0,
+  CUPTI_OPENACC_EVENT_KIND_DEVICE_INIT          = 1,
+  CUPTI_OPENACC_EVENT_KIND_DEVICE_SHUTDOWN      = 2,
+  CUPTI_OPENACC_EVENT_KIND_RUNTIME_SHUTDOWN     = 3,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_LAUNCH       = 4,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_UPLOAD       = 5,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_DOWNLOAD     = 6,
+  CUPTI_OPENACC_EVENT_KIND_WAIT                 = 7,
+  CUPTI_OPENACC_EVENT_KIND_IMPLICIT_WAIT        = 8,
+  CUPTI_OPENACC_EVENT_KIND_COMPUTE_CONSTRUCT    = 9,
+  CUPTI_OPENACC_EVENT_KIND_UPDATE               = 10,
+  CUPTI_OPENACC_EVENT_KIND_ENTER_DATA           = 11,
+  CUPTI_OPENACC_EVENT_KIND_EXIT_DATA            = 12,
+  CUPTI_OPENACC_EVENT_KIND_CREATE               = 13,
+  CUPTI_OPENACC_EVENT_KIND_DELETE               = 14,
+  CUPTI_OPENACC_EVENT_KIND_ALLOC                = 15,
+  CUPTI_OPENACC_EVENT_KIND_FREE                 = 16,
+  CUPTI_OPENACC_EVENT_KIND_FORCE_INT            = 0x7fffffff /* NOTE(review): presumably pins the enum to a 32-bit int representation - confirm */
+} CUpti_OpenAccEventKind;
+
+/**
+ * \brief The OpenACC parent construct kind for OpenACC activity records (stored in their parentConstruct field).
+ */
+typedef enum {
+  CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN          = 0,
+  CUPTI_OPENACC_CONSTRUCT_KIND_PARALLEL         = 1,
+  CUPTI_OPENACC_CONSTRUCT_KIND_KERNELS          = 2,
+  CUPTI_OPENACC_CONSTRUCT_KIND_LOOP             = 3,
+  CUPTI_OPENACC_CONSTRUCT_KIND_DATA             = 4,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ENTER_DATA       = 5,
+  CUPTI_OPENACC_CONSTRUCT_KIND_EXIT_DATA        = 6,
+  CUPTI_OPENACC_CONSTRUCT_KIND_HOST_DATA        = 7,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ATOMIC           = 8,
+  CUPTI_OPENACC_CONSTRUCT_KIND_DECLARE          = 9,
+  CUPTI_OPENACC_CONSTRUCT_KIND_INIT             = 10,
+  CUPTI_OPENACC_CONSTRUCT_KIND_SHUTDOWN         = 11,
+  CUPTI_OPENACC_CONSTRUCT_KIND_SET              = 12,
+  CUPTI_OPENACC_CONSTRUCT_KIND_UPDATE           = 13,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ROUTINE          = 14,
+  CUPTI_OPENACC_CONSTRUCT_KIND_WAIT             = 15,
+  CUPTI_OPENACC_CONSTRUCT_KIND_RUNTIME_API      = 16,
+  CUPTI_OPENACC_CONSTRUCT_KIND_FORCE_INT        = 0x7fffffff /* NOTE(review): presumably pins the enum to a 32-bit int representation - confirm */
+
+} CUpti_OpenAccConstructKind;
+
+typedef enum { /* The OpenMP event kind stored in the eventKind field of CUpti_ActivityOpenMp. */
+  CUPTI_OPENMP_EVENT_KIND_INVALID               = 0,
+  CUPTI_OPENMP_EVENT_KIND_PARALLEL              = 1,
+  CUPTI_OPENMP_EVENT_KIND_TASK                  = 2,
+  CUPTI_OPENMP_EVENT_KIND_THREAD                = 3,
+  CUPTI_OPENMP_EVENT_KIND_IDLE                  = 4,
+  CUPTI_OPENMP_EVENT_KIND_WAIT_BARRIER          = 5,
+  CUPTI_OPENMP_EVENT_KIND_WAIT_TASKWAIT         = 6,
+  CUPTI_OPENMP_EVENT_KIND_FORCE_INT             = 0x7fffffff /* NOTE(review): presumably pins the enum to a 32-bit int representation - confirm */
+} CUpti_OpenMpEventKind;
+
+/**
+ * \brief The base activity record for OpenACC records.
+ *
+ * The OpenACC part of the activity API uses a CUpti_ActivityOpenAcc as a generic
+ * representation for any OpenACC activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_ActivityOpenAcc object can
+ * be cast to the specific OpenACC activity record type appropriate for that kind.
+ *
+ * Note that all OpenACC activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind).
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind).
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number.
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct;
+   * 0 otherwise.
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type.
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number.
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * Thread ID.
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of the async() clause of the corresponding directive.
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used.
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp.
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp.
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+} CUpti_ActivityOpenAcc;
+
+/**
+ * \brief The activity record for OpenACC data (CUPTI_ACTIVITY_KIND_OPENACC_DATA).
+ *
+ * Begins with the fields common to CUpti_ActivityOpenAcc, followed by the data-specific fields.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind).
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind).
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number.
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct;
+   * 0 otherwise.
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type.
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number.
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * Thread ID.
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of the async() clause of the corresponding directive.
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used.
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known (the field is unsigned).
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp.
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp.
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * Number of bytes.
+   */
+  uint64_t bytes;
+
+  /**
+   * Host pointer, if available.
+   */
+  uint64_t hostPtr;
+
+  /**
+   * Device pointer, if available.
+   */
+  uint64_t devicePtr;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the variable
+   * for which this event is triggered, if known, or a null pointer if not.
+   */
+  const char *varName;
+
+} CUpti_ActivityOpenAccData;
+
+/**
+ * \brief The activity record for OpenACC launch (CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH).
+ *
+ * Begins with the fields common to CUpti_ActivityOpenAcc, followed by the launch-specific fields.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind).
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind).
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number.
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct;
+   * 0 otherwise.
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type.
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number.
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * Thread ID.
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of the async() clause of the corresponding directive.
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used.
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known (the field is unsigned).
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp.
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp.
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * The number of gangs created for this kernel launch.
+   */
+  uint64_t numGangs;
+
+  /**
+   * The number of workers created for this kernel launch.
+   */
+  uint64_t numWorkers;
+
+  /**
+   * The number of vector lanes created for this kernel launch.
+   */
+  uint64_t vectorLength;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * kernel being launched, if known, or a null pointer if not.
+   */
+  const char *kernelName;
+
+} CUpti_ActivityOpenAccLaunch;
+
+/**
+ * \brief The activity record for OpenACC other (CUPTI_ACTIVITY_KIND_OPENACC_OTHER).
+ *
+ * Contains only the fields common to CUpti_ActivityOpenAcc; no kind-specific fields follow.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_OTHER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind).
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind).
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number.
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct;
+   * 0 otherwise.
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type.
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number.
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * Thread ID.
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of the async() clause of the corresponding directive.
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used.
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known (the field is unsigned).
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp.
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp.
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+} CUpti_ActivityOpenAccOther;
+
+/**
+ * \brief The base activity record for OpenMP records.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenMP event kind (\see CUpti_OpenMpEventKind).
+   */
+  CUpti_OpenMpEventKind eventKind;
+
+  /**
+   * Version number.
+   */
+  uint32_t version;
+
+  /**
+   * Thread ID.
+   */
+  uint32_t threadId;
+
+  /**
+   * CUPTI start timestamp.
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the OpenMP activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenMP activity is executing.
+   */
+  uint32_t cuThreadId;
+} CUpti_ActivityOpenMp;
+
+/**
+ * \brief The kinds of external APIs supported for correlation.
+ *
+ * Custom correlation kinds are reserved for use by external tools.
+ *
+ * \see CUpti_ActivityExternalCorrelation
+ */
+typedef enum {
+    CUPTI_EXTERNAL_CORRELATION_KIND_INVALID              = 0,
+
+    /**
+     * The external API is unknown to CUPTI.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN              = 1,
+
+    /**
+     * The external API is OpenACC.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC              = 2,
+
+    /**
+     * The external API is custom0.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0              = 3,
+
+    /**
+     * The external API is custom1.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1              = 4,
+
+    /**
+     * The external API is custom2.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM2              = 5,
+
+    /**
+     * Add new kinds before this line. No explicit value: it is one more than the preceding kind (currently 6).
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_SIZE,
+
+    CUPTI_EXTERNAL_CORRELATION_KIND_FORCE_INT            = 0x7fffffff
+} CUpti_ExternalCorrelationKind;
+
+/**
+ * \brief The activity record for correlation with external records.
+ *
+ * This activity record correlates native CUDA records (e.g. CUDA Driver API,
+ * kernels, memcpys, ...) with records from external APIs such as OpenACC
+ * (CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION).
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of external API this record is correlated to.
+   */
+  CUpti_ExternalCorrelationKind externalKind;
+
+  /**
+   * The correlation ID of the associated non-CUDA API record.
+   * The exact field in the associated external record depends
+   * on that record's activity kind (\see externalKind).
+   */
+  uint64_t externalId;
+
+  /**
+   * The correlation ID of the associated CUDA driver or runtime API record.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityExternalCorrelation;
+
+/**
+* \brief The type of a device connected to NVLink.
+*/
+typedef enum {
+    CUPTI_DEV_TYPE_INVALID = 0,
+
+    /**
+    * The device is a GPU.
+    */
+    CUPTI_DEV_TYPE_GPU = 1,
+
+    /**
+    * The device is an NVLink processing unit (NPU) in a CPU.
+    */
+    CUPTI_DEV_TYPE_NPU = 2,
+
+    CUPTI_DEV_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_DevType;
+
+/**
+* \brief NVLink information.
+*
+* This structure gives the capabilities of each logical NVLink connection between two devices,
+* GPU<->GPU or GPU<->CPU, which can be used to understand the topology.
+*/
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0. \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1. \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, uuidDev is the UUID of device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, the npu struct identifies the NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, uuidDev is the UUID of device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, the npu struct identifies the NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag giving the capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t  physicalNvLinkCount;
+
+  /**
+   * Port numbers for the NVLinks connected to device 0 (CUPTI_MAX_NVLINK_PORTS slots, documented as 32).
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for the NVLinks connected to device 1 (CUPTI_MAX_NVLINK_PORTS slots, documented as 32).
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t  bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink4;
+
+#define CUPTI_MAX_GPUS 32 /* Number of entries in CUpti_ActivityPcie gpuAttr.peerDev. */
+/**
+ * Field to differentiate whether a PCIE activity record
+ * describes a GPU or a PCI bridge.
+ */
+typedef enum {
+    /**
+     * PCIE GPU record.
+     */
+    CUPTI_PCIE_DEVICE_TYPE_GPU       = 0,
+
+    /**
+     * PCIE bridge record.
+     */
+    CUPTI_PCIE_DEVICE_TYPE_BRIDGE    = 1,
+
+    CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_PcieDeviceType;
+
+/**
+ * \brief PCI devices information required to construct topology
+ *
+ * This structure gives the capabilities of a GPU or PCI bridge connected to the PCIE bus,
+ * which can be used to understand the topology.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PCIE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * Type of device in topology, \ref CUpti_PcieDeviceType. If type is
+   * CUPTI_PCIE_DEVICE_TYPE_GPU use id.devId and attr.gpuAttr; if type is
+   * CUPTI_PCIE_DEVICE_TYPE_BRIDGE use id.bridgeId and attr.bridgeAttr.
+   */
+  CUpti_PcieDeviceType type;
+
+  /**
+   * A unique identifier for the GPU or bridge in the topology; the valid member is selected by type.
+   */
+  union {
+    /**
+     * GPU device ID
+     */
+    CUdevice devId;
+
+    /**
+     * A unique identifier for Bridge in the Topology
+     */
+    uint32_t bridgeId;
+  } id;
+
+  /**
+   * Domain for the GPU or Bridge, required to identify which PCIE bus it belongs to in
+   * multiple NUMA systems.
+   */
+  uint32_t domain;
+
+  /**
+   * PCIE Generation of GPU or Bridge. \see CUpti_PcieGen
+   */
+  uint16_t pcieGeneration;
+
+  /**
+   * Link rate of the GPU or bridge in gigatransfers per second (GT/s)
+   */
+  uint16_t linkRate;
+
+  /**
+   * Link width of the GPU or bridge
+   */
+  uint16_t linkWidth;
+
+  /**
+   * Upstream bus ID for the GPU or PCI bridge. Required to identify which bus it is
+   * connected to in the topology.
+   */
+  uint16_t upstreamBus;
+
+  /**
+   * Additional attributes of the GPU (gpuAttr) or PCI bridge (bridgeAttr); the valid member is selected by type.
+   */
+  union {
+    struct {
+      /**
+       * UUID for the device. \ref CUpti_ActivityDevice5.
+       */
+      CUuuid uuidDev;
+
+      /**
+       * CUdevices with which this device has P2P capability (CUPTI_MAX_GPUS entries).
+       * This can also be obtained by querying cuDeviceCanAccessPeer or
+       * cudaDeviceCanAccessPeer APIs
+       */
+      CUdevice peerDev[CUPTI_MAX_GPUS];
+    } gpuAttr;
+
+    struct {
+      /**
+       * The downstream bus number, used to search downstream devices/bridges connected
+       * to this bridge.
+       */
+      uint16_t secondaryBus;
+
+      /**
+       * Device ID of the bridge
+       */
+      uint16_t deviceId;
+
+      /**
+       * Vendor ID of the bridge
+       */
+      uint16_t vendorId;
+
+      /**
+       * Padding for alignment
+       */
+      uint16_t pad0;
+    } bridgeAttr;
+  } attr;
+} CUpti_ActivityPcie;
+
+/**
+ * \brief PCIE Generation.
+ *
+ * Enumeration of the PCIE generations reported in the
+ * pcieGeneration field of the PCIE activity record (CUpti_ActivityPcie).
+ */
+typedef enum {
+  /**
+  * PCIE Generation 1
+  */
+  CUPTI_PCIE_GEN_GEN1       = 1,
+
+  /**
+  * PCIE Generation 2
+  */
+  CUPTI_PCIE_GEN_GEN2       = 2,
+
+  /**
+  * PCIE Generation 3
+  */
+  CUPTI_PCIE_GEN_GEN3       = 3,
+
+  /**
+  * PCIE Generation 4
+  */
+  CUPTI_PCIE_GEN_GEN4       = 4,
+
+  /**
+  * PCIE Generation 5
+  */
+  CUPTI_PCIE_GEN_GEN5       = 5,
+
+  /**
+  * PCIE Generation 6
+  */
+  CUPTI_PCIE_GEN_GEN6       = 6,
+
+  CUPTI_PCIE_GEN_FORCE_INT  = 0x7fffffff
+} CUpti_PcieGen;
+
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data at a particular time may choose to
+ * use this type to store the collected event data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityInstantaneousEvent;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event
+ * with event domain instance information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The event domain instance.
+   */
+  uint8_t instance;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousEventInstance;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC) at a particular instant.
+ * This activity record kind is not produced by the activity API but
+ * is included for completeness and ease-of-use. Profiler frameworks built
+ * on top of CUPTI that collect metric data may choose to use this type to
+ * store the collected metric data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousMetric;
+
+/**
+ * \brief The instantaneous activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE) sampled at a particular time. This
+ * activity record kind is not produced by the activity API but is included for
+ * completeness and ease-of-use. Profiler frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The metric domain instance.
+   */
+  uint8_t instance;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[2];
+} CUpti_ActivityInstantaneousMetricInstance;
+
+/**
+ * \brief The types of JIT entry.
+ *
+ * Used by the jitEntryType field of CUpti_ActivityJit2.
+ */
+typedef enum {
+  CUPTI_ACTIVITY_JIT_ENTRY_INVALID= 0,
+
+  /**
+  * PTX to CUBIN.
+  */
+  CUPTI_ACTIVITY_JIT_ENTRY_PTX_TO_CUBIN = 1,
+
+  /**
+  * NVVM-IR to PTX.
+  */
+  CUPTI_ACTIVITY_JIT_ENTRY_NVVM_IR_TO_PTX = 2,
+
+  CUPTI_ACTIVITY_JIT_ENTRY_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitEntryType;
+
+/**
+ * \brief The types of JIT compilation operations.
+ *
+ * Used by the jitOperationType field of CUpti_ActivityJit2.
+ */
+
+typedef enum {
+  CUPTI_ACTIVITY_JIT_OPERATION_INVALID = 0,
+  /**
+  * Loaded from the compute cache.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_LOAD = 1,
+
+  /**
+  * Stored in the compute cache.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_STORE = 2,
+
+  /**
+  * JIT compilation.
+  */
+  CUPTI_ACTIVITY_JIT_OPERATION_COMPILE = 3,
+
+  CUPTI_ACTIVITY_JIT_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitOperationType;
+
+/**
+ * \brief The activity record for JIT operations.
+ *
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule from the
+ * Compute Cache. Gives the exact hashed path of where the cached module is loaded from,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which
+   * this record belongs. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of the compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+
+  /**
+   * The ID of the process where the JIT operation is executing.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the thread where the JIT operation is executing.
+   */
+  uint32_t threadId;
+} CUpti_ActivityJit2;
+
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution of a graph without giving visibility
+ * into the execution of its nodes. This is intended to reduce the overhead of tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the first node of the graph is executed.
+   * If this is INT_MAX, then the start is on the host.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the first node of the graph is executed.
+   * If this is INT_MAX, then the start is on the host.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+
+  /**
+   * The ID of the device where the last node of the graph is executed.
+   */
+  uint32_t endDeviceId;
+
+  /**
+   * The ID of the context where the last node of the graph is executed.
+   */
+  uint32_t endContextId;
+} CUpti_ActivityGraphTrace2;
+
+/**
+ * \brief The launch mode for device graph execution; see CUpti_ActivityDeviceGraphTrace::deviceLaunchMode.
+ */
+typedef enum {
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_INVALID = 0,
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_FIRE_AND_FORGET = 1,
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_TAIL = 2,
+    CUPTI_DEVICE_GRAPH_LAUNCH_MODE_FIRE_AND_FORGET_AS_SIBLING = 3,
+} CUpti_DeviceGraphLaunchMode;
+
+/**
+ * \brief The activity record for trace of device graph execution.
+ *
+ * This activity record represents execution of a device-launched graph without giving visibility
+ * into the execution of its nodes. This is intended to reduce the overhead of tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_DEVICE_GRAPH_TRACE.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device where the first node of the graph is executed.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The unique ID of the graph that has launched this graph.
+   */
+  uint32_t launcherGraphId;
+
+  /**
+   * The type of launch, stored as a uint32_t. See \ref CUpti_DeviceGraphLaunchMode
+   */
+  uint32_t deviceLaunchMode;
+
+  /**
+   * The ID of the context where the first node of the graph is executed.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched (64-bit in this record).
+   */
+  uint64_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+
+} CUpti_ActivityDeviceGraphTrace;
+
+/**
+ * \brief The activity record for trace of decompression operations.
+ *
+ * This activity record represents execution of a batch of decompression operations.
+ * The activity kind is CUPTI_ACTIVITY_KIND_MEM_DECOMPRESS.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEM_DECOMPRESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the HW channel on which the decompression operation is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The correlation ID of the decompression operations. Each operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The number of operations in the batch.
+   */
+  uint32_t numberOfOperations;
+
+  /**
+   * The number of bytes to be read and decompressed in the
+   * batch operation.
+   */
+  uint64_t sourceBytes;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The start timestamp.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the start time is unknown.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the end time is unknown.
+   */
+  uint64_t end;
+} CUpti_ActivityMemDecompress;
+
+END_PACKED_ALIGNMENT
+
+/**
+ * \brief Activity attributes.
+ *
+ * These attributes are used to control the behavior of the activity
+ * API.
+ */
+typedef enum {
+    /**
+     * The device memory size (in bytes) reserved for storing profiling data for concurrent
+     * kernels (activity kind \ref CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), memcopies and memsets
+     * for each buffer on a context. The value is a size_t.
+     *
+     * There is a limit on how many device buffers can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the buffers, it pre-allocates only as many
+     * buffers as set by the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     * When all of the data in a buffer is consumed, it is added in the reuse pool, and
+     * CUPTI picks a buffer from this pool when a new buffer is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels, memcopies and memsets might cause CUPTI to allocate more device buffers.
+     * CUPTI allocates another buffer only when it runs out of the buffers in the
+     * reuse pool.
+     *
+     * Since buffer allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 buffers of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger buffer size leaves less device memory for the application.
+     * Having smaller buffer size increases the risk of dropping timestamps for
+     * records if too many kernels or memcopies or memsets are launched at one time.
+     *
+     * This value only applies to new buffer allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 3200000 (~3MB) which can accommodate profiling data
+     * up to 100,000 kernels, memcopies and memsets combined.
+     *
+     * Note: Starting with the CUDA 12.0 Update 1 release, CUPTI allocates profiling buffer in the
+     * device memory by default as this might help in improving the performance of the
+     * tracing run. Refer to the description of the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED for more details.
+     * Size of the memory and maximum number of pools are still controlled by the attributes
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE and \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Note: The actual amount of device memory per buffer reserved by CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE                      = 0,
+
+    /**
+     * The device memory size (in bytes) reserved for storing profiling
+     * data for CDP operations for each buffer on a context. The
+     * value is a size_t.
+     *
+     * Having larger buffer size means fewer flush operations but
+     * consumes more device memory. This value only applies to new
+     * allocations.
+     *
+     * Set this value before initializing CUDA or before creating a
+     * context to ensure it is considered for the following allocations.
+     *
+     * The default value is 8388608 (8MB).
+     *
+     * Note: The actual amount of device memory per context reserved by
+     * CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP          = 1,
+
+    /**
+     * The maximum number of device memory buffers per context. The value is a size_t.
+     *
+     * For an application with high rate of kernel launches, memcopies and memsets having a bigger pool
+     * limit helps in timestamp collection for all these activities at the expense of a larger memory footprint.
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for more details.
+     *
+     * Setting this value will not modify the number of memory buffers
+     * currently stored.
+     *
+     * Set this value before initializing CUDA to ensure the limit is
+     * not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT                = 2,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * There is a limit on how many semaphore pools can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the semaphore pools, it pre-allocates only as many
+     * semaphore pools as set by the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     * When all of the data in a semaphore pool is consumed, it is added in the reuse pool, and
+     * CUPTI picks a semaphore pool from the reuse pool when a new semaphore pool is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels might cause CUPTI to allocate more semaphore pools.
+     * CUPTI allocates another semaphore pool only when it runs out of the semaphore pools in the
+     * reuse pool.
+     *
+     * Since semaphore pool allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 semaphore pools of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger semaphore pool size leaves less device memory for the application.
+     * Having smaller semaphore pool size increases the risk of dropping timestamps for
+     * kernel records if too many kernels are issued/launched at one time.
+     *
+     * This value only applies to new semaphore pool allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 25000 which can accommodate profiling data for up to 25,000 kernels.
+     *
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE           = 3,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * The maximum number of profiling semaphore pools per context. The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for more details.
+     *
+     * Set this value before initializing CUDA to ensure the limit is not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT          = 4,
+
+    /**
+     * The flag to indicate whether the user should provide a zeroed-out activity buffer.
+     * The value is a uint8_t.
+     *
+     * If the value of this attribute is non-zero, user should provide
+     * a zero value buffer in the \ref CUpti_BuffersCallbackRequestFunc.
+     * If the user does not provide a zero value buffer after setting this to non-zero,
+     * the activity buffer may contain some uninitialized values when CUPTI returns it in
+     * \ref CUpti_BuffersCallbackCompleteFunc.
+     *
+     * If the value of this attribute is zero, CUPTI will initialize the user buffer
+     * received in the \ref CUpti_BuffersCallbackRequestFunc to zero before filling it.
+     * If the user sets this to zero, a few stalls may appear in critical path because CUPTI
+     * will zero out the buffer in the main thread.
+     * Set this value before returning from \ref CUpti_BuffersCallbackRequestFunc to
+     * ensure it is considered for all the subsequent user buffers.
+     *
+     * The default value is 0.
+     */
+    CUPTI_ACTIVITY_ATTR_ZEROED_OUT_ACTIVITY_BUFFER              = 5,
+
+    /**
+     * Number of device buffers to pre-allocate for a context during the initialization phase.
+     * The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of device buffers set using
+     * the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these buffers (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE        = 6,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * Number of profiling semaphore pools to pre-allocate for a context during the
+     * initialization phase. The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of profiling semaphore pools set
+     * using the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these pools (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE  = 7,
+
+    /**
+     * Allocate page-locked (pinned) host memory for storing profiling data for concurrent
+     * kernels, memcopies and memsets for each buffer on a context. The value is a uint8_t.
+     *
+     * Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the pinned host
+     * memory by default as this might help in improving the performance of the tracing run.
+     * Allocating excessive amounts of pinned memory may degrade system performance, since it
+     * reduces the amount of memory available to the system for paging. For this reason user
+     * might want to change the location from pinned host memory to device memory by setting
+     * value of this attribute to 0.
+     *
+     * Using page-locked (pinned) host memory buffers is not supported on confidential computing
+     * devices. On setting this attribute to 1, CUPTI will return CUPTI_ERROR_NOT_SUPPORTED.
+     *
+     * The default value is 1. NOTE(review): conflicts with the CUDA 12.0 Update 1 note on DEVICE_BUFFER_SIZE.
+     */
+    CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED         = 8,
+
+    /**
+     * Request activity buffers per-thread to store CUPTI activity records
+     * in the activity buffer on per-thread basis. The value is a uint8_t.
+     *
+     * The attribute should be set before registering the buffer callbacks using
+     * cuptiActivityRegisterCallbacks API and before any of the CUPTI activity kinds are enabled.
+     * This makes sure that all the records are stored in activity buffers allocated per-thread.
+     * Changing this attribute in the middle of the profiling session will result in undefined behavior.
+     *
+     * The default value is 0.
+     */
+    CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER, /* implicit value 9 (follows HOST_PINNED = 8) */
+
+
+
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT                 = 0x7fffffff
+} CUpti_ActivityAttribute;
+
+/**
+ * \brief Thread-Id types.
+ *
+ * CUPTI uses different methods to obtain the thread-id depending on the
+ * support and the underlying platform. This enum documents these methods
+ * for each type. APIs \ref cuptiSetThreadIdType and \ref cuptiGetThreadIdType
+ * can be used to set and get the thread-id type.
+ */
+typedef enum {
+    /**
+     * Default type
+     * Windows uses API GetCurrentThreadId()
+     * Linux/Mac/Android/QNX use POSIX pthread API pthread_self()
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_DEFAULT       = 0,
+
+    /**
+     * This type is based on the system API available on the underlying platform
+     * and thread-id obtained is supposed to be unique for the process lifetime.
+     * Windows uses API GetCurrentThreadId()
+     * Linux uses syscall SYS_gettid
+     * Mac uses syscall SYS_thread_selfid
+     * Android/QNX use gettid()
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_SYSTEM        = 1,
+
+    /**
+     * Add new enums before this field.
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_SIZE          = 2,
+
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityThreadIdType;
+
+/**
+ * \brief Get the CUPTI timestamp.
+ *
+ * Returns a timestamp normalized to correspond with the start and end
+ * timestamps reported in the CUPTI activity records. The timestamp is
+ * reported in nanoseconds.
+ *
+ * \param timestamp Returns the CUPTI timestamp
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p timestamp is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp);
+
+/**
+ * \brief Get the ID of a context.
+ *
+ * Get the ID of a context.
+ *
+ * \param context The context
+ * \param contextId Returns a process-unique ID for the context
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT The context is NULL or not valid.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p contextId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId);
+
+/**
+ * \brief Get the ID of a stream.
+ *
+ * Get the ID of a stream. The stream ID is unique within a context
+ * (i.e. all streams within a context will have unique stream
+ * IDs).
+ *
+ * \param context If non-NULL then the stream is checked to ensure
+ * that it belongs to this context. Typically this parameter should be
+ * null.
+ * \param stream The stream
+ * \param streamId Returns a context-unique ID for the stream
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+ * if \p context is non-NULL and \p stream does not belong to the
+ * context
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+ *
+ * **DEPRECATED** This method is deprecated as of CUDA 8.0.
+ * Use method cuptiGetStreamIdEx instead.
+ */
+CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId);
+
+/**
+* \brief Get the ID of a stream.
+*
+* Get the ID of a stream. The stream ID is unique within a context
+* (i.e. all streams within a context will have unique stream
+* IDs).
+*
+* \param context If non-NULL then the stream is checked to ensure
+* that it belongs to this context. Typically this parameter should be
+* null.
+* \param stream The stream
+* \param perThreadStream Flag to indicate if program is compiled for per-thread streams
+* \param streamId Returns a context-unique ID for the stream
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_NOT_INITIALIZED
+* \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+* if \p context is non-NULL and \p stream does not belong to the
+* context
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+*/
+CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream, uint8_t perThreadStream, uint32_t *streamId);
+
+/**
+ * \brief Get the ID of a device
+ *
+ * If \p context is NULL, returns the ID of the device that contains
+ * the currently active context. If \p context is non-NULL, returns
+ * the ID of the device which contains that context. Operates in a
+ * similar manner to cudaGetDevice() or cuCtxGetDevice() but may be
+ * called from within callback functions.
+ *
+ * \param context The context, or NULL to indicate the current context.
+ * \param deviceId Returns the ID of the device that is current for
+ * the calling thread.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE if unable to get device ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p deviceId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId);
+
+/**
+ * \brief Get the unique ID of a graph node
+ *
+ * Returns the unique ID of the CUDA graph node.
+ *
+ * \param node The graph node.
+ * \param nodeId Returns the unique ID of the node
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p node is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphNodeId(CUgraphNode node, uint64_t *nodeId);
+
+/**
+ * \brief Get the unique ID of graph
+ *
+ * Returns the unique ID of CUDA graph.
+ *
+ * \param graph The graph.
+ * \param pId Returns the unique ID of the graph
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graph is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphId(CUgraph graph, uint32_t *pId);
+
+/**
+ * \brief Get the unique ID of executable graph
+ *
+ * Returns the unique ID of executable CUDA graph.
+ *
+ * \param graphExec The executable graph.
+ * \param pId Returns the unique ID of the executable graph
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graphExec is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphExecId(CUgraphExec graphExec, uint32_t *pId);
+
+/**
+ * \brief Enable collection of a specific kind of activity record.
+ *
+ * Enable collection of a specific kind of activity record. Multiple
+ * kinds can be enabled by calling this function multiple times. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record. For certain activity kinds
+ * it dumps existing records.
+ *
+ * In general, the behavior of this API is similar to the API \ref cuptiActivityEnable i.e. it
+ * enables the collection of a specific kind of activity record.
+ * Additionally, this API can help in dumping the records for activities which happened in
+ * the past before enabling the corresponding activity kind.
+ * The API allows retrieving records for the current resource allocations done in CUDA.
+ * For CUPTI_ACTIVITY_KIND_DEVICE, existing device records are dumped
+ * For CUPTI_ACTIVITY_KIND_CONTEXT, existing context records are dumped
+ * For CUPTI_ACTIVITY_KIND_STREAM, existing stream records are dumped
+ * For CUPTI_ACTIVITY_KIND_NVLINK, existing NVLINK records are dumped
+ * For CUPTI_ACTIVITY_KIND_PCIE, existing PCIE records are dumped
+ * For other activities, the behavior is similar to the API \ref cuptiActivityEnable
+ *
+ * Device records are emitted in CUPTI on CUDA driver initialization. Those records
+ * can only be retrieved by the user if CUPTI is attached before CUDA initialization.
+ * Context and stream records are emitted on context and stream creation.
+ * The use case of the API is to provide the records for CUDA resources
+ * (contexts/streams/devices) that are currently active if user late attaches CUPTI.
+ *
+ * Before calling this function, the user must register buffer callbacks
+ * to get the activity records by calling \ref cuptiActivityRegisterCallbacks.
+ * If the user does not register the buffers and calls API \ref cuptiActivityEnableAndDump,
+ * then CUPTI will enable the activity kind but not provide any records for that
+ * activity kind.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_UNKNOWN if buffer is not initialized.
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAndDump(CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record.
+ *
+ * Disable collection of a specific kind of activity record. Multiple
+ * kinds can be disabled by calling this function multiple times. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Enable collection of a specific kind of activity record for a
+ * context.  This setting done by this API will supersede the global
+ * settings for activity records enabled by \ref cuptiActivityEnable.
+ * Multiple kinds can be enabled by calling this function multiple
+ * times.
+ *
+ * \param context The context for which activity is to be enabled
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Disable collection of a specific kind of activity record for a context.
+ * This setting done by this API will supersede the global settings
+ * for activity records.
+ * Multiple kinds can be disabled by calling this function multiple times.
+ *
+ * \param context The context for which activity is to be disabled
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Get the number of activity records that were dropped because of
+ * insufficient buffer space.
+ *
+ * Get the number of records that were dropped because of insufficient
+ * buffer space.  The dropped count includes records that could not be
+ * recorded because CUPTI did not have activity buffer space available
+ * for the record (because the CUpti_BuffersCallbackRequestFunc
+ * callback did not return an empty buffer of sufficient size) and
+ * also CDP records that could not be recorded because the device-side
+ * buffer was full (size is controlled by the
+ * CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP attribute). The dropped
+ * count maintained for the queue is reset to zero when this function
+ * is called.
+ *
+ * \param context The context, or NULL to get dropped count from global queue
+ * \param streamId The stream ID
+ * \param dropped The number of records that were dropped since the last call
+ * to this function.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p dropped is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId,
+                                                       size_t *dropped);
+
+/**
+ * \brief Iterate over the activity records in a buffer.
+ *
+ * This is a helper function to iterate over the activity records in a
+ * buffer. A buffer of activity records is typically obtained by
+ * receiving a CUpti_BuffersCallbackCompleteFunc callback. Stop iterating
+ * the buffer when an error occurs.
+ *
+ * An example of typical usage:
+ * \code
+ * CUpti_Activity *record = NULL;
+ * CUptiResult status = CUPTI_SUCCESS;
+ *   do {
+ *      status = cuptiActivityGetNextRecord(buffer, validSize, &record);
+ *      if(status == CUPTI_SUCCESS) {
+ *           // Use record here...
+ *      }
+ *      else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
+ *          break;
+ *      else if (status == CUPTI_ERROR_INVALID_KIND)
+ *          break;
+ *      else {
+ *          goto Error;
+ *      }
+ *    } while (1);
+ * \endcode
+ *
+ * \param buffer The buffer containing activity records
+ * \param record Inputs the previous record returned by
+ * cuptiActivityGetNextRecord and returns the next activity record
+ * from the buffer. If input value is NULL, returns the first activity
+ * record in the buffer. Records of certain kinds like CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
+ * may contain invalid (0) timestamps, indicating that no timing information could
+ * be collected for lack of device memory.
+ * \param validBufferSizeBytes The number of valid bytes in the buffer.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if no more records in the buffer
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p buffer is NULL.
+ * \retval CUPTI_ERROR_INVALID_KIND if activity record is either incomplete or invalid
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes,
+                                                CUpti_Activity **record);
+
+/**
+ * \brief Function type for callback used by CUPTI to request an empty
+ * buffer for storing activity records.
+ *
+ * This callback function signals the CUPTI client that an activity
+ * buffer is needed by CUPTI. The activity buffer is used by CUPTI to
+ * store activity records. The callback function can decline the
+ * request by setting \p *buffer to NULL. In this case CUPTI may drop
+ * activity records.
+ *
+ * \param buffer Returns the new buffer. If set to NULL then no buffer
+ * is returned.
+ * \param size Returns the size of the returned buffer.
+ * \param maxNumRecords Returns the maximum number of records that
+ * should be placed in the buffer. If 0 then the buffer is filled with
+ * as many records as possible. If > 0 the buffer is filled with at
+ * most that many records before it is returned.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackRequestFunc)(
+    uint8_t **buffer,
+    size_t *size,
+    size_t *maxNumRecords);
+
+/**
+ * \brief Function type for callback used by CUPTI to return a buffer
+ * of activity records.
+ *
+ * This callback function returns to the CUPTI client a buffer
+ * containing activity records.  The buffer contains \p validSize
+ * bytes of activity records which should be read using
+ * cuptiActivityGetNextRecord. The number of dropped records can be
+ * read using cuptiActivityGetNumDroppedRecords. After this call CUPTI
+ * relinquishes ownership of the buffer and will not use it
+ * anymore. The client may return the buffer to CUPTI using the
+ * CUpti_BuffersCallbackRequestFunc callback.
+ * Note: CUDA 6.0 onwards, all buffers returned by this callback are
+ * global buffers i.e. there is no context/stream specific buffer.
+ * User needs to parse the global buffer to extract the context/stream
+ * specific activity records.
+ *
+ * \param context The context this buffer is associated with. If NULL, the
+ * buffer is associated with the global activities. This field is deprecated
+ * as of CUDA 6.0 and will always be NULL.
+ * \param streamId The stream id this buffer is associated with.
+ * This field is deprecated as of CUDA 6.0 and will always be 0.
+ * \param buffer The activity record buffer.
+ * \param size The total size of the buffer in bytes as set in
+ * CUpti_BuffersCallbackRequestFunc.
+ * \param validSize The number of valid bytes in the buffer.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackCompleteFunc)(
+    CUcontext context,
+    uint32_t streamId,
+    uint8_t *buffer,
+    size_t size,
+    size_t validSize);
+
+/**
+ * \brief Registers callback functions with CUPTI for activity buffer
+ * handling.
+ *
+ * This function registers two callback functions to be used in asynchronous
+ * buffer handling. If registered, activity record buffers are handled using
+ * asynchronous requested/completed callbacks from CUPTI.
+ *
+ * Registering these callbacks prevents the client from using CUPTI's
+ * blocking enqueue/dequeue functions.
+ *
+ * \param funcBufferRequested callback which is invoked when an empty
+ * buffer is requested by CUPTI
+ * \param funcBufferCompleted callback which is invoked when a buffer
+ * containing activity records is available from CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either \p
+ * funcBufferRequested or \p funcBufferCompleted is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
+        CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
+
+/**
+ * \brief Wait for all activity records to be delivered via the
+ * completion callback.
+ *
+ * This function does not return until all activity records associated
+ * with the specified context/stream are returned to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks. To
+ * ensure that all activity records are complete, the requested
+ * stream(s), if any, are synchronized.
+ *
+ * If \p context is NULL, the global activity records (i.e. those not
+ * associated with a particular stream) are flushed (in this case no
+ * streams are synchronized).  If \p context is a valid CUcontext and
+ * \p streamId is 0, the buffers of all streams of this context are
+ * flushed.  Otherwise, the buffers of the specified stream in this
+ * context are flushed.
+ *
+ * Before calling this function, the buffer handling callback api
+ * must be activated by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param context A valid CUcontext or NULL.
+ * \param streamId The stream ID.
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded
+ * by a successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * **DEPRECATED** This method is deprecated
+ * CONTEXT and STREAMID will be ignored. Use cuptiActivityFlushAll
+ * to flush all data.
+ */
+CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId, uint32_t flag);
+
+/**
+ * \brief Request to deliver activity records via the buffer completion callback.
+ *
+ * This function returns the activity records associated with all contexts/streams
+ * (and the global buffers not associated with any stream) to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks.
+ *
+ * This is a blocking call but it doesn't issue any CUDA synchronization calls
+ * implicitly thus it's not guaranteed that all activities are completed on the
+ * underlying devices. Activity record is considered as completed if it has all
+ * the information filled up including the timestamps if any. It is the client's
+ * responsibility to issue necessary CUDA synchronization calls before calling
+ * this function if all activity records with complete information are expected
+ * to be delivered.
+ *
+ * Behavior of the function based on the input flag:
+ * (-) ::For default flush i.e. when flag is set as 0, it returns all the
+ * activity buffers which have all the activity records completed, buffers need not
+ * to be full though. It doesn't return buffers which have one or more incomplete
+ * records. Default flush can be done at a regular interval in a separate thread.
+ * (-) ::For forced flush i.e. when flag CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is passed
+ * to the function, it returns all the activity buffers including the ones which have
+ * one or more incomplete activity records. It's suggested for clients to do the
+ * force flush before the termination of the profiling session to allow remaining
+ * buffers to be delivered. In general, it can be done in the at-exit handler.
+ *
+ * Before calling this function, the buffer handling callback api must be activated
+ * by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded by a
+ * successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * \see cuptiActivityFlushPeriod
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag);
+
+/**
+ * \brief Read an activity API attribute.
+ *
+ * Read an activity API attribute and return it in \p *value.
+ *
+ * \param attr The attribute to read
+ * \param valueSize Size of buffer pointed by the value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+/**
+ * \brief Write an activity API attribute.
+ *
+ * Write an activity API attribute.
+ *
+ * \param attr The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+
+/**
+ * \brief Set Unified Memory Counter configuration.
+ *
+ * Set the configuration before enabling the corresponding activity kind
+ * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+ * The API should be called after CUDA driver initialization.
+ *
+ * \param config A pointer to \ref CUpti_ActivityUnifiedMemoryCounterConfig structures
+ * containing Unified Memory counter configuration.
+ * \param count Number of Unified Memory counter configuration structures
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED One potential reason is that
+ * platform (OS/arch) does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE Indicates that the device
+ * does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES Indicates that
+ * multi-GPU configuration without P2P support between any pair of devices
+ * does not support the unified memory counters
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count);
+
+/**
+ * \brief Get auto boost state
+ *
+ * The profiling results can be inconsistent in case auto boost is enabled.
+ * CUPTI tries to disable auto boost while profiling. It can fail to disable in
+ * cases where user does not have the permissions or CUDA_AUTO_BOOST env
+ * variable is set. The function can be used to query whether auto boost is
+ * enabled.
+ *
+ * \param context A valid CUcontext.
+ * \param state A pointer to \ref CUpti_ActivityAutoBoostState structure which
+ * contains the current state and the id of the process that has requested the
+ * current state
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p context or \p state is NULL
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the device does not support auto boost
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ */
+CUptiResult CUPTIAPI cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state);
+
+/**
+ * \brief Set PC sampling configuration.
+ *
+ * For Pascal and older GPU architectures this API must be called before enabling
+ * activity kind CUPTI_ACTIVITY_KIND_PC_SAMPLING. There is no such requirement
+ * for Volta and newer GPU architectures.
+ *
+ * For Volta and newer GPU architectures if this API is called in the middle of
+ * execution, PC sampling configuration will be updated for subsequent kernel launches.
+ *
+ * \param ctx The context
+ * \param config A pointer to \ref CUpti_ActivityPCSamplingConfig structure
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this api is called while
+ * some valid event collection method is set.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the system/device
+ * does not support PC sampling
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(CUcontext ctx, CUpti_ActivityPCSamplingConfig *config);
+
+/**
+ * \brief Returns the last error from a cupti call or callback
+ *
+ * Returns the last error that has been produced by any of the cupti api calls
+ * or the callback in the same host thread and resets it to CUPTI_SUCCESS.
+ */
+CUptiResult CUPTIAPI cuptiGetLastError(void);
+
+/**
+ * \brief Set the thread-id type
+ *
+ * CUPTI uses the method corresponding to set type to generate the thread-id.
+ * See enum \ref CUpti_ActivityThreadIdType for the list of methods.
+ * Activity records having thread-id field contain the same value.
+ * Thread id type must not be changed during the profiling session to
+ * avoid thread-id value mismatch across activity records.
+ *
+ * \param type The thread-id type to set
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if \p type is not supported on the platform
+ */
+CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type);
+
+/**
+ * \brief Get the thread-id type
+ *
+ * Returns the thread-id type used in CUPTI
+ *
+ * \param type Returns the thread-id type currently used by CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p type is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type);
+
+/**
+* \brief Check support for a compute capability
+*
+* This function is used to check the support for a device based on
+* its compute capability. It sets the \p support when the compute
+* capability is supported by the current version of CUPTI, and clears
+* it otherwise. This version of CUPTI might not support all GPUs sharing
+* the same compute capability. It is suggested to use API \ref
+* cuptiDeviceSupported which provides correct information.
+*
+* \param major The major revision number of the compute capability
+* \param minor The minor revision number of the compute capability
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+*
+* \sa ::cuptiDeviceSupported
+*/
+CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor, int *support);
+
+/**
+* \brief Check support for a compute device
+*
+* This function is used to check the support for a compute device.
+* It sets the \p support when the device is supported by the current
+* version of CUPTI, and clears it otherwise.
+*
+* \param dev The device handle returned by CUDA Driver API cuDeviceGet
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+* \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+*
+* \sa ::cuptiComputeCapabilitySupported
+*/
+CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support);
+
+/**
+ * \brief Device virtualization modes.
+ *
+ * Indicates the virtualization mode in which the CUDA device is running.
+ */
+typedef enum {
+  /**
+   * No virtualization mode is associated with the device
+   * i.e. it's a baremetal GPU
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_NONE = 0,
+  /**
+   * The device is associated with the pass-through GPU.
+   * In this mode, an entire physical GPU is directly assigned
+   * to one virtual machine (VM).
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_PASS_THROUGH = 1,
+  /**
+   * The device is associated with the virtual GPU (vGPU).
+   * In this mode multiple virtual machines (VMs) have simultaneous,
+   * direct access to a single physical GPU.
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_VIRTUAL_GPU = 2,
+
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_FORCE_INT = 0x7fffffff
+} CUpti_DeviceVirtualizationMode;
+
+/**
+ * \brief Query the virtualization mode of the device
+ *
+ * This function is used to query the virtualization mode of the CUDA device.
+ *
+ * \param dev The device handle returned by CUDA Driver API cuDeviceGet
+ * \param mode Pointer to a CUpti_DeviceVirtualizationMode to return the virtualization mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p mode is NULL
+ *
+ */
+CUptiResult CUPTIAPI cuptiDeviceVirtualizationMode(CUdevice dev, CUpti_DeviceVirtualizationMode *mode);
+
+/**
+ * \brief Detach CUPTI from the running process
+ *
+ * This API detaches CUPTI from the running process. It destroys and cleans up all the
+ * resources associated with CUPTI in the current process. After CUPTI detaches from the process,
+ * the process will keep on running with no CUPTI attached to it.
+ * For safe operation of the API, it is recommended that this API be invoked from the exit callsite
+ * of any of the CUDA Driver or Runtime APIs. Otherwise the CUPTI client needs to make sure that
+ * the required CUDA synchronization and CUPTI activity buffer flush are done before calling the API.
+ * Sample code showing the usage of the API in the CUPTI callback handler code:
+ * \code
+  void CUPTIAPI
+  cuptiCallbackHandler(void *userdata, CUpti_CallbackDomain domain,
+      CUpti_CallbackId cbid, void *cbdata)
+  {
+    const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata;
+
+    // Take this code path when CUPTI detach is requested
+    if (detachCupti) {
+      switch(domain)
+      {
+        case CUPTI_CB_DOMAIN_RUNTIME_API:
+        case CUPTI_CB_DOMAIN_DRIVER_API:
+          if (cbInfo->callbackSite == CUPTI_API_EXIT) {
+              // call the CUPTI detach API
+              cuptiFinalize();
+          }
+          break;
+        default:
+          break;
+      }
+    }
+  }
+ \endcode
+ */
+CUptiResult CUPTIAPI cuptiFinalize(void);
+
+/**
+ * \brief Push an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is entering an external API region.
+ * When a CUPTI activity API record is created while within an external API region and
+ * CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION is enabled, the activity API record will
+ * be preceded by a CUpti_ActivityExternalCorrelation record for each \ref CUpti_ExternalCorrelationKind.
+ *
+ * \param kind The kind of external API that activities should be correlated with.
+ * \param id External correlation id.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid.
+ */
+CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t id);
+
+/**
+ * \brief Pop an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is leaving an external API region.
+ *
+ * \param kind The kind of external API that activities should be correlated with.
+ * \param lastId If the function returns successfully, contains the last external correlation id for this \p kind; can be NULL.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid.
+ * \retval CUPTI_ERROR_QUEUE_EMPTY No external id is currently associated with \p kind.
+ */
+CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t *lastId);
+
+/**
+ * \brief Controls the collection of queued and submitted timestamps for kernels.
+ *
+ * This API is used to control the collection of queued and submitted timestamps
+ * for kernels whose records are provided through the struct \ref CUpti_ActivityKernel9.
+ * Default value is 0, i.e. these timestamps are not collected. This API needs
+ * to be called before initialization of CUDA and this setting should not be
+ * changed during the profiling session.
+ *
+ * This API is not supported if the HW trace is enabled through the API \ref cuptiActivityEnableHWTrace.
+ * \param enable is a boolean, denoting whether these timestamps should be
+ * collected
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable);
+
+/**
+ * \brief Sets the flush period for the worker thread
+ *
+ * CUPTI creates a worker thread to minimize the perturbation of application-created
+ * threads. CUPTI offloads certain operations from the application threads to the worker
+ * thread; this includes synchronization of profiling resources between host and device and
+ * delivery of the activity buffers to the client using the callback registered in
+ * cuptiActivityRegisterCallbacks. For performance reasons, CUPTI wakes up the worker
+ * thread based on certain heuristics.
+ *
+ * This API is used to control the flush period of the worker thread. This setting will
+ * override the CUPTI heuristics. Setting time to zero disables the periodic flush and
+ * restores the default behavior.
+ *
+ * Periodic flush can return only those activity buffers which are full and have all the
+ * activity records completed.
+ *
+ * It's allowed to use the API \ref cuptiActivityFlushAll to flush the data on-demand, even
+ * when the client sets the periodic flush.
+ *
+ * \param time flush period in milliseconds (ms)
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ *
+ * \see cuptiActivityFlushAll
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushPeriod(uint32_t time);
+
+/**
+ * \brief Controls the collection of launch attributes for kernels.
+ *
+ * This API is used to control the collection of launch attributes for kernels whose
+ * records are provided through the struct \ref CUpti_ActivityKernel9.
+ * Default value is 0, i.e. these attributes are not collected.
+ *
+ * \param enable is a boolean, denoting whether these launch attributes should be collected
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLaunchAttributes(uint8_t enable);
+
+/**
+ * \brief Function type for callback used by CUPTI to request a timestamp
+ * to be used in activity records.
+ *
+ * This callback function signals the CUPTI client that a timestamp needs
+ * to be returned. This timestamp is treated as a normalized timestamp
+ * to be used for various purposes in CUPTI, for example to store the start and
+ * end timestamps reported in the CUPTI activity records.
+ * The returned timestamp must be in nanoseconds.
+ *
+ * \sa ::cuptiActivityRegisterTimestampCallback
+ */
+typedef uint64_t (CUPTIAPI *CUpti_TimestampCallbackFunc)(void);
+
+/**
+ * \brief Registers callback function with CUPTI for providing timestamp.
+ *
+ * This function registers a callback function to obtain a timestamp of the user's
+ * choice instead of using the CUPTI-provided timestamp.
+ * By default CUPTI uses different methods, based on the underlying platform,
+ * to retrieve the timestamp:
+ *   - Linux and Android use clock_gettime(CLOCK_REALTIME, ..)
+ *   - Windows uses QueryPerformanceCounter()
+ *   - QNX uses ClockCycles()
+ * Timestamps retrieved using these methods are converted to nanoseconds if needed
+ * before usage.
+ *
+ * Timestamps for GPU activities such as kernels, memory copies and memset operations are
+ * recorded directly on the GPU. To provide a unified and normalized view of these timestamps
+ * in relation to CPU time, CUPTI performs a linear interpolation to convert GPU timestamps
+ * into CPU timestamps during post-processing.
+ * For activities where timestamps are captured on the GPU, the timestamp callback is invoked
+ * during the post-processing phase, while converting GPU timestamps into CPU timestamps.
+ * For activities for which timestamps are captured directly on the CPU, the timestamp callback
+ * is invoked immediately at the time of the activity.
+ *
+ * The registration of the timestamp callback should be done before any of the CUPTI
+ * activity kinds are enabled to make sure that all the records report the timestamp using
+ * the callback function registered through the cuptiActivityRegisterTimestampCallback API.
+ *
+ * Changing the timestamp callback function in CUPTI through the
+ * cuptiActivityRegisterTimestampCallback API in the middle of the profiling
+ * session can cause records generated prior to the change to report
+ * timestamps through the previous timestamp method.
+ *
+ * \param funcTimestamp callback which is invoked when a timestamp is
+ * needed by CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcTimestamp is NULL
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterTimestampCallback(CUpti_TimestampCallbackFunc funcTimestamp);
+
+/**
+ * \brief Controls the collection of records for device-launched graphs.
+ *
+ * This API is used to control the collection of records for device-launched graphs.
+ * Default value is 0, i.e. these records are not collected. This API needs
+ * to be called before initialization of CUDA and this setting should not be
+ * changed during the profiling session.
+ *
+ * \param enable is a boolean, denoting whether these records should be
+ * collected
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableDeviceGraph(uint8_t enable);
+
+/**
+ * \brief Controls the collection of activity records for specific CUDA Driver APIs.
+ *
+ * Activity kind CUPTI_ACTIVITY_KIND_DRIVER controls the collection of either all
+ * CUDA Driver APIs or none. API cuptiActivityEnableDriverApi can be used for fine-grained
+ * control; it allows enabling/disabling tracing of a specific set of CUDA Driver APIs.
+ * To disable collection of a small set of CUDA Driver APIs, the user can
+ * first enable the collection of all Driver APIs using the activity kind
+ * CUPTI_ACTIVITY_KIND_DRIVER and then call this API to disable specific Driver APIs.
+ * To enable the collection of a small set of CUDA Driver APIs, the user can
+ * call this API without using the activity kind CUPTI_ACTIVITY_KIND_DRIVER.
+ *
+ * Note: Activity kind CUPTI_ACTIVITY_KIND_DRIVER overrides the settings done by this API
+ * if it is called after the API.
+ *
+ * \param cbid callback id of the CUDA Driver API. This can be found in the header cupti_driver_cbid.h.
+ * \param enable is a boolean, denoting whether to enable or disable the collection
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableDriverApi(CUpti_CallbackId cbid, uint8_t enable);
+
+/**
+ * \brief Controls the collection of activity records for specific CUDA Runtime APIs.
+ *
+ * Activity kind CUPTI_ACTIVITY_KIND_RUNTIME controls the collection of either all
+ * CUDA Runtime APIs or none. API cuptiActivityEnableRuntimeApi can be used for fine-grained
+ * control; it allows enabling/disabling tracing of a specific set of CUDA Runtime APIs.
+ * To disable collection of a small set of CUDA Runtime APIs, the user can
+ * first enable the collection of all Runtime APIs using the activity kind
+ * CUPTI_ACTIVITY_KIND_RUNTIME and then call this API to disable specific Runtime APIs.
+ * To enable the collection of a small set of CUDA Runtime APIs, the user can
+ * call this API without using the activity kind CUPTI_ACTIVITY_KIND_RUNTIME.
+ *
+ * Note: Activity kind CUPTI_ACTIVITY_KIND_RUNTIME overrides the settings done by this API
+ * if it is called after the API.
+ *
+ * \param cbid callback id of the CUDA Runtime API. This can be found in the header cupti_runtime_cbid.h.
+ * \param enable is a boolean, denoting whether to enable or disable the collection
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableRuntimeApi(CUpti_CallbackId cbid, uint8_t enable);
+
+/**
+ * \brief Enables the collection of CUDA kernel timestamps through HW events.
+ *
+ * This API enables the collection of CUDA kernel timestamps through HW events instead
+ * of the traditional SW instrumentation and semaphore based approach.
+ * This option is only available on the Blackwell architecture.
+ * This API should be called after the CUDA driver is initialized.
+ *
+ * \param enable is a boolean, denoting whether to enable or disable the collection through HW events
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if CUPTI is not initialized or the CUDA driver is not initialized
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if HW trace cannot be enabled on the current platform
+ * \retval CUPTI_ERROR_VIRTUALIZED_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_CONFIDENTIAL_COMPUTING_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_CMP_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_MIG_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_SLI_DEVICE_NOT_SUPPORTED
+ * \retval CUPTI_ERROR_WSL_DEVICE_NOT_SUPPORTED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableHWTrace(uint8_t enable);
+
+
+/**
+ * \brief Enables tracking the source library for memory allocation requests.
+ *
+ * This API is used to control whether or not we track the source library of
+ * memory allocation requests. Default value is 0, i.e. it is not tracked. The
+ * activity kind CUPTI_ACTIVITY_KIND_MEMORY2 needs to be enabled, and if this flag is
+ * set, we get the full path of the shared object responsible for the GPU memory allocation
+ * request in the member source in the CUpti_ActivityMemory4 records. Also note that this feature
+ * adds runtime overhead.
+ *
+ * \param enable is a boolean, denoting whether the source library of the memory allocation
+ * request needs to be tracked
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAllocationSource (uint8_t enable);
+
+/**
+ * \brief Enables collecting records for all synchronization operations.
+ *
+ * CUPTI provides CUDA event query and stream query records via CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+ * Using this API, a CUPTI client can enable recording of all CUDA event query and stream query records
+ * even if the event has not yet been completed or the operations on the stream have not yet all been completed,
+ * respectively.
+ *
+ * By default, the record is only generated if all captured work has been completed for the CUDA event.
+ * By default, the record is only generated if all operations have been completed on the stream.
+ *
+ * \param enable is a boolean, denoting whether to enable or disable the collection of all CUDA event query
+ * and stream query records
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAllSyncRecords(uint8_t enable);
+
+/** @} */ /* END CUPTI_ACTIVITY_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+// Including deprecated structures of CUPTI_ACTIVITY_API
+#include "cupti_activity_deprecated.h"
+
+#endif /*_CUPTI_ACTIVITY_H_*/
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9d725499ffa13ac7de864719abee2baa88d6c13
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_activity_deprecated.h
@@ -0,0 +1,5335 @@
+/*
+ * Copyright 2011-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_DEPRECATED_H_)
+#define _CUPTI_ACTIVITY_DEPRECATED_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_ActivityOverhead
+ * \see CUpti_ActivityOverhead2
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemory3
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityUnifiedMemoryCounter2
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ */
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ * (Deprecated in CUDA 12.2)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD). These records are now reported using
+ * CUpti_ActivityOverhead3.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead, CUPTI, DRIVER, COMPILER etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+} CUpti_ActivityOverhead;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads. (deprecated)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead, CUPTI, DRIVER, COMPILER etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which the
+   * records belong. This ID is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the overhead operation.
+   * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+} CUpti_ActivityOverhead2;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use (present only when CUPTILP64 is defined).
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for device
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use (present only when CUPTILP64 is not defined).
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice2;
+
+/**
+ * \brief The activity record for a device. (CUDA 7.0 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for device
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use (present only when CUPTILP64 is not defined).
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set the device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+  /* Reserved bytes; together with isCudaVisible they occupy 8 bytes. */
+  uint8_t reserved[7];
+} CUpti_ActivityDevice3;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ *
+ * The tail of this record carries the MIG related fields
+ * (isMigEnabled, gpuInstanceId, computeInstanceId and migUuid).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * Only compiled in when CUPTILP64 is not defined; presumably it keeps
+   * the record layout identical where pointers are 4 bytes wide
+   * (TODO confirm).
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set the device visibility using the CUDA_VISIBLE_DEVICES environment
+   * variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6];
+
+  /**
+   * GPU Instance ID for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute Instance ID for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+} CUpti_ActivityDevice4;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL
+   * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration requested by the kernel. The value is one
+   * of the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigRequested;
+
+  /**
+   * The cache configuration used for the kernel. The value is one of
+   * the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigExecuted;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the kernel. Each kernel execution
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t runtimeCorrelationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel2;
+
+/**
+ * \brief The activity record for a kernel (CUDA 6.5 (with sm_52 support) onwards).
+ * (deprecated in CUDA 9.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel3;
+
+/**
+ * \brief The activity record for a kernel (CUDA 9.0 (with sm_70 support) onwards).
+ * (deprecated in CUDA 11.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+, the cacheConfig values are
+   * not updated when the field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * The command buffer is a buffer written by the CUDA driver to send
+   * commands like kernel launch, memory copy, etc. to the GPU. All launches
+   * of CUDA kernels are asynchronous with respect to the host: the host
+   * requests the launch by writing commands into the command buffer, then
+   * returns without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+} CUpti_ActivityKernel4;
+
+/**
+ * \brief The activity record for a kernel (CUDA 11.0 (with sm_80 support) onwards).
+ * (deprecated in CUDA 11.2)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+, the cacheConfig values are
+   * not updated when the field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * The command buffer is a buffer written by the CUDA driver to send
+   * commands like kernel launch, memory copy, etc. to the GPU. All launches
+   * of CUDA kernels are asynchronous with respect to the host: the host
+   * requests the launch by writing commands into the command buffer, then
+   * returns without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per-block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+} CUpti_ActivityKernel5;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The requested and executed cache configurations. The member \c both
+   * shares storage with the two 4-bit fields of \c config.
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+} CUpti_ActivityKernel6;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.8)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The requested and executed cache configurations. The member \c both
+   * shares storage with the two 4-bit fields of \c config.
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityKernel7;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ *
+ * NOTE(review): the other kernel records in this file state that kernel
+ * activities are now reported using CUpti_ActivityKernel9, so this record
+ * is presumably superseded as well - confirm and add a deprecation note.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The requested and executed cache configurations. The member \c both
+   * shares storage with the two 4-bit fields of \c config.
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes. 64-bit counterpart of the 32-bit field localMemoryTotal.
+   */
+  uint64_t localMemoryTotal_v2;
+} CUpti_ActivityKernel8;
+
+/**
+ * \brief The activity record for memory copies. (deprecated)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpy;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpy3;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpy4;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated
+ * by CUPTI. Peer-to-peer memory copy activities are now reported using the
+ * CUpti_ActivityMemcpyPtoP2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind read by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+  * The ID of the device where the memory copy is occurring.
+  */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpyPtoP;
+
+typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpyPtoP2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpyPtoP3;
+
+/**
+ * \brief The activity record for memset. (deprecated)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemset;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemset2;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemset3;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    /**
+     * The size of the memory pool in bytes and the process ID of the memory pool.
+     * \p size is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     * \p processId is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+     */
+   union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory2;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory2
+ * (same kind, CUPTI_ACTIVITY_KIND_MEMORY2) that adds \p utilizedSize to the
+ * memory pool configuration. \ref CUpti_ActivityMemory, by contrast, provides
+ * a single record for the memory allocation and memory release operations.
+ */
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct PACKED_ALIGNMENT {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    /**
+     * The size of the memory pool in bytes and the process ID of the memory pool.
+     * \p size is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     * \p processId is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+     */
+    union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+
+    /**
+     * The utilized size of the memory pool. \p utilizedSize is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t utilizedSize;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory3;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum number of bytes to keep in the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+} CUpti_ActivityMemoryPool;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time. (deprecated in CUDA 8.0)
+ *
+ * The marker is specified with a descriptive name and unique ID
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ * Marker activity is now reported using the
+ * CUpti_ActivityMarker2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. \p objectKind indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+} CUpti_ActivityMarker;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+} CUpti_ActivityGlobalAccess;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated in CUDA 9.0)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of
+   * threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityGlobalAccess2;
+
+/**
+ * \brief The activity record for source level result
+ * branch. (deprecated)
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ * Branch activities are now reported using the
+ * CUpti_ActivityBranch2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * The number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of
+   * threads that executed this instruction.
+   */
+  uint64_t threadsExecuted;
+} CUpti_ActivityBranch;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 8.0)
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+} CUpti_ActivityPCSampling;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 9.0)
+ *
+ * This activity record holds information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID of the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID used to correlate this record with the global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * This field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * The stall reason for the samples in this record. Holds one of the
+   * reasons from \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  uint32_t pad; /* NOTE(review): undocumented here; presumably reserved padding - confirm. */
+} CUpti_ActivityPCSampling2;
+
+/**
+ * \brief The activity record for Unified Memory counters. (deprecated in CUDA 7.0)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * The scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope.
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * The ID of the device involved in the memory transfer operation.
+   * It is not relevant if the scope of the counter is global (all devices).
+   */
+  uint32_t deviceId;
+
+  /**
+   * The value of the counter.
+   *
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The ID of the process to which this record belongs. In case of
+   * global scope, processId is undefined.
+   */
+  uint32_t processId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter;
+
+/**
+ * \brief The activity record for Unified Memory counters. (deprecated in CUDA 12.8)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * The value of the counter.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the
+   * memory region in bytes.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it
+   * is the number of page fault groups for the same page.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT,
+   * it is the program counter for the instruction that caused the fault.
+   */
+  uint64_t value;
+
+  /**
+   * The start timestamp of the counter, in ns.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity starts on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is
+   * captured when CUDA driver started processing the fault.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp
+   * is captured when CUDA driver detected thrashing of memory region.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING,
+   * timestamp is captured when throttling operation was started by CUDA driver.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP,
+   * timestamp is captured when CUDA driver has pushed all required operations
+   * to the processor specified by dstId.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp of the counter, in ns.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity finishes on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is
+   * captured when CUDA driver queues the replay of faulting memory accesses on the GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp
+   * is captured when throttling operation was finished by CUDA driver.
+   */
+  uint64_t end;
+
+  /**
+   * This is the virtual base address of the page/s being transferred. For CPU and
+   * GPU faults, the virtual address for the page that faulted.
+   */
+  uint64_t address;
+
+  /**
+   * The ID of the source CPU/device involved in the memory transfer, page fault, thrashing,
+   * throttling or remote map operation. For counterKind
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT.
+   */
+  uint32_t srcId;
+
+  /**
+   * The ID of the destination CPU/device involved in the memory transfer or remote map
+   * operation. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t dstId;
+
+  /**
+   * The ID of the stream causing the transfer.
+   * The value of this field is invalid.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+   * and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH
+   * and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter2;
+
+/**
+ * \brief NVLink information. (deprecated in CUDA 9.0)
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 4 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[4];
+
+  /**
+   * Port numbers for maximum 4 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[4];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+} CUpti_ActivityNvLink;
+
+/**
+ * \brief NVLink information. (deprecated in CUDA 10.0)
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t  bandwidth;
+} CUpti_ActivityNvLink2;
+
+/**
+ * \brief NVLink information.
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink3;
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution for a graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ * Graph trace activity is now reported using the CUpti_ActivityGraphTrace2 record.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the graph execution is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the graph is being launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+} CUpti_ActivityGraphTrace;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ * Context activity is now reported using the CUpti_ActivityContext3 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+} CUpti_ActivityContext;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ * Context activity is now reported using the CUpti_ActivityContext3 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+
+  /**
+   * The ID of the parent context. It is 0 if the
+   * context does not have a parent.
+   */
+  uint32_t parentContextId;
+
+  /**
+   * This field indicates whether the context is a green context.
+   */
+  uint8_t isGreenContext;
+
+  uint8_t padding; /* NOTE(review): undocumented here; presumably reserved padding - confirm. */
+
+  /**
+   * Number of multiprocessors assigned to the green context.
+   * Invalid if the field 'isGreenContext' is 0.
+   */
+  uint16_t numMultiprocessors;
+} CUpti_ActivityContext2;
+
+/**
+ * \brief The activity record for JIT operations.
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache.
+ * Gives the exact hashed path of where the cached module is loaded from,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ *
+ * JIT activity is now reported using the CUpti_ActivityJit2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which
+   * this record belongs. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of the compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+} CUpti_ActivityJit;
+
+/**
+ * \brief The activity record for a CUDA event.
+ *
+ * This activity is used to track recorded events
+ * (CUPTI_ACTIVITY_KIND_CUDA_EVENT).
+ *
+ * Structure deprecated in CUDA 12.8: Refer to CUpti_ActivityCudaEvent2
+ * for the latest structure.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the API with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context where the event was recorded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream where the event was recorded.
+   */
+  uint32_t streamId;
+
+  /**
+   * A unique event ID to identify the event record.
+   */
+  uint32_t eventId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityCudaEvent;
+
+/**
+ * \brief The activity record for synchronization management.
+ *
+ * This activity is used to track various CUDA synchronization APIs
+ * (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION).
+ *
+ * Structure deprecated in CUDA 12.8: Refer to CUpti_ActivitySynchronization2
+ * for the latest structure.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The type of record.
+   */
+  CUpti_ActivitySynchronizationType type;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the API with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context for which the synchronization API is called.
+   * In case of context synchronization API it is the context ID for which the API is called.
+   * In case of stream/event synchronization it is the ID of the context where the stream/event was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuEventSynchronize.
+   */
+  uint32_t streamId;
+
+  /**
+   * The event ID for which the synchronization API is called.
+   * A CUPTI_SYNCHRONIZATION_INVALID_VALUE value indicates the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuStreamSynchronize.
+   */
+  uint32_t cudaEventId;
+} CUpti_ActivitySynchronization;
+
+/**
+ * \brief The activity record for memory copies.
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ *
+ * Structure deprecated in CUDA 12.8: Refer to CUpti_ActivityMemcpy6
+ * for the latest structure.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind of the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t pad2;
+} CUpti_ActivityMemcpy5;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_ACTIVITY_DEPRECATED_H_*/
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d228c4df3c1f090a4979bfe10132e080042fef
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_common.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+ 
+#if !defined(__CUPTI_COMMON_H__)
+#define __CUPTI_COMMON_H__
+
+#ifndef CUPTIAPI // calling-convention decoration for CUPTI API declarations; an existing definition is kept
+#ifdef _WIN32
+#define CUPTIAPI __stdcall // Windows builds (_WIN32 defined) use __stdcall
+#else
+#define CUPTIAPI // expands to nothing everywhere else
+#endif
+#endif
+
+#ifndef CUPTIUTILAPI // same scheme as CUPTIAPI; NOTE(review): presumably decorates CUPTI utility functions - no use is visible here
+#ifdef _WIN32
+#define CUPTIUTILAPI __stdcall // Windows builds (_WIN32 defined) use __stdcall
+#else
+#define CUPTIUTILAPI // expands to nothing everywhere else
+#endif
+#endif
+
+#if defined(__LP64__) // CUPTILP64 is defined to 1 when the compiler reports __LP64__ ...
+#define CUPTILP64 1
+#elif defined(_WIN64) // ... or when building for 64-bit Windows (_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64 // otherwise it is explicitly left undefined, so test it with #ifdef
+#endif
+
+#define ACTIVITY_RECORD_ALIGNMENT 8 // alignment in bytes requested by PACKED_ALIGNMENT below
+#if defined(_WIN32) // Windows 32- and 64-bit
+#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
+#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT)) // align the declaration to ACTIVITY_RECORD_ALIGNMENT bytes
+#define END_PACKED_ALIGNMENT __pragma(pack(pop)) // undo the pack(push,1) issued by START_PACKED_ALIGNMENT
+#elif defined(__GNUC__) // GCC and other compilers that define __GNUC__
+#define START_PACKED_ALIGNMENT // empty: packing is expressed by the attributes in PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
+#define END_PACKED_ALIGNMENT // empty: nothing to restore
+#else // all other compilers
+#define START_PACKED_ALIGNMENT // all three macros are empty: no packing or alignment control is applied
+#define PACKED_ALIGNMENT
+#define END_PACKED_ALIGNMENT
+#endif
+
+#endif /*__CUPTI_COMMON_H__*/
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_metrics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..64b7f2d14580320f1ec938da5ea356add191ec3c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_metrics.h
@@ -0,0 +1,824 @@
+/*
+ * Copyright 2011-2024   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_METRIC_H_)
+#define _CUPTI_METRIC_H_
+
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI // calling-convention decoration applied to every prototype in this header; an existing definition is kept
+#ifdef _WIN32
+#define CUPTIAPI __stdcall // Windows builds (_WIN32 defined) use __stdcall
+#else
+#define CUPTIAPI // expands to nothing everywhere else
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_METRIC_API CUPTI Metric API
+ * Functions, types, and enums that implement the CUPTI Metric API.
+ *
+ * \note The CUPTI metric API from the header cupti_metrics.h is not supported on devices
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+ * This API is deprecated in CUDA 12.8 release and will be removed in a future CUDA release.
+ * This is replaced by the host profiling API in the header cupti_profiler_host.h and
+ * target profiling API in the header cupti_range_profiler.h which are supported on
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU architectures).
+ *
+ * @{
+ */
+
+/**
+ * \brief ID for a metric.
+ *
+ * A metric provides a measure of some aspect of the device. IDs are 32-bit unsigned integers.
+ */
+typedef uint32_t CUpti_MetricID;
+
+/**
+ * \brief A metric category.
+ *
+ * Each metric is assigned to a category that represents the general
+ * type of the metric. A metric's category is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_CATEGORY
+ * attribute.
+ */
+typedef enum {
+  /**
+   * A memory related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MEMORY          = 0,
+  /**
+   * An instruction related metric.
+   */
+  CUPTI_METRIC_CATEGORY_INSTRUCTION     = 1,
+  /**
+   * A multiprocessor related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MULTIPROCESSOR  = 2,
+  /**
+   * A cache related metric.
+   */
+  CUPTI_METRIC_CATEGORY_CACHE           = 3,
+  /**
+   * A texture related metric.
+   */
+  CUPTI_METRIC_CATEGORY_TEXTURE         = 4,
+  /**
+   * An NVLink related metric.
+   */
+  CUPTI_METRIC_CATEGORY_NVLINK          = 5,
+  /**
+   * A PCIe related metric.
+   */
+  CUPTI_METRIC_CATEGORY_PCIE           = 6,
+  CUPTI_METRIC_CATEGORY_FORCE_INT                         = 0x7fffffff, /* not a category; presumably pins the enum to a 32-bit int */
+} CUpti_MetricCategory;
+
+/**
+ * \brief A metric evaluation mode.
+ *
+ * A metric can be evaluated per hardware instance to know the load balancing
+ * across instances of a domain or the metric can be evaluated in aggregate mode
+ * when the events involved in metric evaluation are from different event
+ * domains. It might be possible to evaluate some metrics in both
+ * modes for convenience. A metric's evaluation mode is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_EVALUATION_MODE
+ * attribute.
+ */
+typedef enum {
+  /**
+   * If this bit is set, the metric can be profiled for each instance of the
+   * domain. The event values passed to \ref cuptiMetricGetValue can contain
+   * values for one instance of the domain. And \ref cuptiMetricGetValue can
+   * be called for each instance.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE         = 1,
+  /**
+   * If this bit is set, the metric can be profiled over all instances. The
+   * event values passed to \ref cuptiMetricGetValue can be aggregated values
+   * of events for all instances of the domain.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_AGGREGATE            = 1 << 1,
+  CUPTI_METRIC_EVALUATION_MODE_FORCE_INT            = 0x7fffffff, /* not a mode bit; presumably pins the enum to a 32-bit int */
+} CUpti_MetricEvaluationMode;
+
+/**
+ * \brief Kinds of metric values.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef enum {
+  /**
+   * The metric value is a 64-bit double.
+   */
+  CUPTI_METRIC_VALUE_KIND_DOUBLE            = 0,
+  /**
+   * The metric value is a 64-bit unsigned integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_UINT64            = 1,
+  /**
+   * The metric value is a percentage represented by a 64-bit
+   * double. For example, 57.5% is represented by the value 57.5.
+   */
+  CUPTI_METRIC_VALUE_KIND_PERCENT           = 2,
+  /**
+   * The metric value is a throughput represented by a 64-bit unsigned
+   * integer. The unit for throughput values is bytes/second.
+   */
+  CUPTI_METRIC_VALUE_KIND_THROUGHPUT        = 3,
+  /**
+   * The metric value is a 64-bit signed integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_INT64             = 4,
+  /**
+   * The metric value is a utilization level, as represented by
+   * CUpti_MetricValueUtilizationLevel.
+   */
+  CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL = 5,
+
+  CUPTI_METRIC_VALUE_KIND_FORCE_INT  = 0x7fffffff /* not a value kind; presumably pins the enum to a 32-bit int */
+} CUpti_MetricValueKind;
+
+/**
+ * \brief Enumeration of utilization levels for metric values of kind
+ * CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL. Utilization values can
+ * vary from IDLE (0) to MAX (10) but the enumeration only provides
+ * specific names for a few values (0, 2, 5, 8 and 10).
+ */
+typedef enum {
+  CUPTI_METRIC_VALUE_UTILIZATION_IDLE      = 0,
+  CUPTI_METRIC_VALUE_UTILIZATION_LOW       = 2,
+  CUPTI_METRIC_VALUE_UTILIZATION_MID       = 5,
+  CUPTI_METRIC_VALUE_UTILIZATION_HIGH      = 8,
+  CUPTI_METRIC_VALUE_UTILIZATION_MAX       = 10,
+  CUPTI_METRIC_VALUE_UTILIZATION_FORCE_INT = 0x7fffffff /* not a level; presumably pins the enum to a 32-bit int */
+} CUpti_MetricValueUtilizationLevel;
+
+/**
+ * \brief Metric attributes.
+ *
+ * Metric attributes describe properties of a metric. These attributes
+ * can be read using \ref cuptiMetricGetAttribute.
+ */
+typedef enum {
+  /**
+   * Metric name. Value is a null-terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_NAME              = 0,
+  /**
+   * Short description of metric. Value is a null-terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of metric. Value is a null-terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of the metric. Value is of type CUpti_MetricCategory.
+   */
+  CUPTI_METRIC_ATTR_CATEGORY          = 3,
+  /**
+   * Value type of the metric. Value is of type CUpti_MetricValueKind.
+   */
+  CUPTI_METRIC_ATTR_VALUE_KIND          = 4,
+  /**
+   * Metric evaluation mode. Value is of type CUpti_MetricEvaluationMode.
+   */
+  CUPTI_METRIC_ATTR_EVALUATION_MODE     = 5,
+  CUPTI_METRIC_ATTR_FORCE_INT         = 0x7fffffff, /* not an attribute; presumably pins the enum to a 32-bit int */
+} CUpti_MetricAttribute;
+
+/**
+ * \brief A metric value.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef union {
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_DOUBLE.
+   */
+  double metricValueDouble;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UINT64.
+   */
+  uint64_t metricValueUint64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_INT64.
+   */
+  int64_t metricValueInt64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_PERCENT. For example, 57.5% is
+   * represented by the value 57.5.
+   */
+  double metricValuePercent;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_THROUGHPUT.  The unit for
+   * throughput values is bytes/second.
+   */
+  uint64_t metricValueThroughput;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
+   */
+  CUpti_MetricValueUtilizationLevel metricValueUtilizationLevel;
+} CUpti_MetricValue;
+
+/**
+ * \brief Device class.
+ *
+ * Enumeration of the device classes that can be supplied for the metric
+ * property CUPTI_METRIC_PROPERTY_DEVICE_CLASS.
+ */
+typedef enum {
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TESLA          = 0,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_QUADRO         = 1,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_GEFORCE        = 2,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TEGRA          = 3,
+} CUpti_MetricPropertyDeviceClass;
+
+/**
+ * \brief Metric device properties.
+ *
+ * Metric device properties describe device properties which are needed for a metric.
+ * Some of these properties can be collected using cuDeviceGetAttribute.
+ */
+typedef enum {
+  /*
+   * Number of multiprocessors on a device.  This can be collected
+   * using value of \param CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_MULTIPROCESSOR_COUNT,
+  /*
+   * Maximum number of warps on a multiprocessor. This can be
+   * collected using ratio of value of \param
+   * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR and \param
+   * CU_DEVICE_ATTRIBUTE_WARP_SIZE of cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_WARPS_PER_MULTIPROCESSOR,
+  /*
+   * GPU Time for kernel in ns. This should be profiled using CUPTI
+   * Activity API.
+   */
+  CUPTI_METRIC_PROPERTY_KERNEL_GPU_TIME,
+  /*
+   * Clock rate for device in KHz.  This should be collected using
+   * value of \param CU_DEVICE_ATTRIBUTE_CLOCK_RATE of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_CLOCK_RATE,
+  /*
+   * Number of Frame buffer units for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FRAME_BUFFER_COUNT,
+  /*
+   * Global memory bandwidth in KBytes/sec. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH
+   * of cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GLOBAL_MEMORY_BANDWIDTH,
+  /*
+   * PCIE link rate in Mega bits/sec. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_RATE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_RATE,
+  /*
+   * PCIE link width for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_WIDTH,
+  /*
+   * PCIE generation for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_GEN of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_GEN,
+  /*
+   * The device class. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_DEVICE_CLASS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS,
+  /*
+   * Peak single precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_SP_PER_CYCLE,
+  /*
+   * Peak double precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_DP_PER_CYCLE,
+  /*
+   * Number of L2 units on a device. This can be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_L2_UNITS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_L2_UNITS,
+  /*
+   * Whether ECC support is enabled on the device. This can be
+   * collected using value of \param CU_DEVICE_ATTRIBUTE_ECC_ENABLED of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_ECC_ENABLED,
+  /*
+   * Peak half precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_HP_PER_CYCLE,
+  /*
+   * NVLINK bandwidth for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GPU_CPU_NVLINK_BANDWIDTH,
+} CUpti_MetricPropertyID;
+
+/**
+ * \brief Get the total number of metrics available on any device.
+ *
+ * Returns the total number of metrics available on any CUDA-capable
+ * devices.
+ *
+ * \param numMetrics Returns the number of metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+*/
+CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics);
+
+/**
+ * \brief Get all the metrics available on any device.
+ *
+ * Returns the metric IDs in \p metricArray for all CUDA-capable
+ * devices.  The size of the \p metricArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p metricArray buffer must be at
+ * least \p numMetrics * sizeof(CUpti_MetricID) or all metric IDs will
+ * not be returned. The value returned in \p *arraySizeBytes contains
+ * the number of bytes returned in \p metricArray.
+ *
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+*/
+CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
+                                      CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get the number of metrics for a device.
+ *
+ * Returns the number of metrics available for a device.
+ *
+ * \param device The CUDA device
+ * \param numMetrics Returns the number of metrics available for the
+ * device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
+                                              uint32_t *numMetrics);
+
+/**
+ * \brief Get the metrics for a device.
+ *
+ * Returns the metric IDs in \p metricArray for a device.  The size of
+ * the \p metricArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p metricArray buffer must be at least \p numMetrics *
+ * sizeof(CUpti_MetricID) or else all metric IDs will not be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p metricArray.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
+                                            size_t *arraySizeBytes,
+                                            CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get a metric attribute.
+ *
+ * Returns a metric attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ *
+ * \param metric ID of the metric
+ * \param attrib The metric attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a metric attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
+                                             CUpti_MetricAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+
+/**
+ * \brief Find a metric by name.
+ *
+ * Find a metric by name and return the metric ID in \p *metric.
+ *
+ * \param device The CUDA device
+ * \param metricName The name of metric to find
+ * \param metric Returns the ID of the found metric or undefined if
+ * unable to find the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if unable to find a metric
+ * with name \p metricName. In this case \p *metric is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricName or \p
+ * metric are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
+                                              const char *metricName,
+                                              CUpti_MetricID *metric);
+
+/**
+ * \brief Get number of events required to calculate a metric.
+ *
+ * Returns the number of events in \p numEvents that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numEvents Returns the number of events required for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
+                                             uint32_t *numEvents);
+
+/**
+ * \brief Get the events required to calculate a metric.
+ *
+ * Gets the event IDs in \p eventIdArray required to calculate a \p
+ * metric. The size of the \p eventIdArray buffer is given by \p
+ * *eventIdArraySizeBytes and must be at least \p numEvents *
+ * sizeof(CUpti_EventID) or all events will not be returned. The value
+ * returned in \p *eventIdArraySizeBytes contains the number of bytes
+ * returned in \p eventIdArray.
+ *
+ * \param metric ID of the metric
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArraySizeBytes or \p
+ * eventIdArray are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
+                                           size_t *eventIdArraySizeBytes,
+                                           CUpti_EventID *eventIdArray);
+
+/**
+ * \brief Get number of properties required to calculate a metric.
+ *
+ * Returns the number of properties in \p numProp that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numProp Returns the number of properties required for the
+ * metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numProp is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
+                                                 uint32_t *numProp);
+
+/**
+ * \brief Get the properties required to calculate a metric.
+ *
+ * Gets the property IDs in \p propIdArray required to calculate a \p
+ * metric. The size of the \p propIdArray buffer is given by \p
+ * *propIdArraySizeBytes and must be at least \p numProp *
+ * sizeof(CUpti_MetricPropertyID) or all properties will not be
+ * returned. The value returned in \p *propIdArraySizeBytes contains
+ * the number of bytes returned in \p propIdArray.
+ *
+ * \param metric ID of the metric
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes,
+ * and returns the number of bytes written to \p propIdArray
+ * \param propIdArray Returns the IDs of the properties required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p propIdArraySizeBytes or \p
+ * propIdArray are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
+                                               size_t *propIdArraySizeBytes,
+                                               CUpti_MetricPropertyID *propIdArray);
+
+
+/**
+ * \brief For a metric get the groups of events that must be collected
+ * in the same pass.
+ *
+ * For a metric get the groups of events that must be collected in the
+ * same pass to ensure that the metric is calculated correctly. If the
+ * events are not collected as specified then the metric value may be
+ * inaccurate.
+ *
+ * \p *eventGroupSets is set to NULL if a metric does not have any
+ * required event group. In this case the events needed for the metric
+ * can be grouped in any manner for collection.
+ *
+ * \param context The context for event collection
+ * \param metric The metric ID
+ * \param eventGroupSets Returns a CUpti_EventGroupSets object that
+ * indicates the events that must be collected in the same pass to
+ * ensure the metric is calculated correctly.  Returns NULL if no
+ * grouping is required for metric
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ */
+CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
+                                                          CUpti_MetricID metric,
+                                                          CUpti_EventGroupSets **eventGroupSets);
+
+/**
+ * \brief For a set of metrics, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events required for those metrics.
+ *
+ * For a set of metrics, get the grouping that indicates the number of
+ * passes and the event groups necessary to collect the events
+ * required for those metrics.
+ *
+ * \see cuptiEventGroupSetsCreate for details on event group set
+ * creation.
+ *
+ * \param context The context for event collection
+ * \param metricIdArraySizeBytes Size of the metricIdArray in bytes
+ * \param metricIdArray Array of metric IDs
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
+                                                     size_t metricIdArraySizeBytes,
+                                                     CUpti_MetricID *metricIdArray,
+                                                     CUpti_EventGroupSets **eventGroupPasses);
+
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events collected for a metric to calculate the metric
+ * value. Metric value evaluation depends on the evaluation mode
+ * \ref CUpti_MetricEvaluationMode that the metric supports.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
+ * then it assumes that the input event value is for one domain instance.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE,
+ * it assumes that input event values are
+ * normalized to represent all domain instances on a device. For the
+ * most accurate metric collection, the events required for the metric
+ * should be collected for all profiled domain instances. For example,
+ * to collect all instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param device The CUDA device that the metric is being calculated for
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param timeDuration The duration over which the events were
+ * collected, in ns
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_INVALID_METRIC_VALUE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
+                                         CUpti_MetricID metric,
+                                         size_t eventIdArraySizeBytes,
+                                         CUpti_EventID *eventIdArray,
+                                         size_t eventValueArraySizeBytes,
+                                         uint64_t *eventValueArray,
+                                         uint64_t timeDuration,
+                                         CUpti_MetricValue *metricValue);
+
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events and properties collected for a metric to calculate
+ * the metric value. Metric value evaluation depends on the evaluation
+ * mode \ref CUpti_MetricEvaluationMode that the metric supports.  If
+ * a metric has evaluation mode as
+ * CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, then it assumes that the
+ * input event value is for one domain instance.  If a metric has
+ * evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, it
+ * assumes that input event values are normalized to represent all
+ * domain instances on a device. For the most accurate metric
+ * collection, the events required for the metric should be collected
+ * for all profiled domain instances. For example, to collect all
+ * instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes
+ * \param propIdArray The metric property IDs required to calculate \p metric
+ * \param propValueArraySizeBytes The size of \p propValueArray in bytes
+ * \param propValueArray The metric property values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * metric properties in \p propIdArray
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
+                                          size_t eventIdArraySizeBytes,
+                                          CUpti_EventID *eventIdArray,
+                                          size_t eventValueArraySizeBytes,
+                                          uint64_t *eventValueArray,
+                                          size_t propIdArraySizeBytes,
+                                          CUpti_MetricPropertyID *propIdArray,
+                                          size_t propValueArraySizeBytes,
+                                          uint64_t *propValueArray,
+                                          CUpti_MetricValue *metricValue);
+
+/** @} */ /* END CUPTI_METRIC_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_METRIC_H_*/
+
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_nvtx_cbid.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_nvtx_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ad8c85e6e674b9a016580be88d3c5a2d2619990
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_nvtx_cbid.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2013-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+typedef enum {
+  CUPTI_CBID_NVTX_INVALID                               = 0,
+  CUPTI_CBID_NVTX_nvtxMarkA                             = 1,
+  CUPTI_CBID_NVTX_nvtxMarkW                             = 2,
+  CUPTI_CBID_NVTX_nvtxMarkEx                            = 3,
+  CUPTI_CBID_NVTX_nvtxRangeStartA                       = 4,
+  CUPTI_CBID_NVTX_nvtxRangeStartW                       = 5,
+  CUPTI_CBID_NVTX_nvtxRangeStartEx                      = 6,
+  CUPTI_CBID_NVTX_nvtxRangeEnd                          = 7,
+  CUPTI_CBID_NVTX_nvtxRangePushA                        = 8,
+  CUPTI_CBID_NVTX_nvtxRangePushW                        = 9,
+  CUPTI_CBID_NVTX_nvtxRangePushEx                       = 10,
+  CUPTI_CBID_NVTX_nvtxRangePop                          = 11,
+  CUPTI_CBID_NVTX_nvtxNameCategoryA                     = 12,
+  CUPTI_CBID_NVTX_nvtxNameCategoryW                     = 13,
+  CUPTI_CBID_NVTX_nvtxNameOsThreadA                     = 14,
+  CUPTI_CBID_NVTX_nvtxNameOsThreadW                     = 15,
+  CUPTI_CBID_NVTX_nvtxNameCuDeviceA                     = 16,
+  CUPTI_CBID_NVTX_nvtxNameCuDeviceW                     = 17,
+  CUPTI_CBID_NVTX_nvtxNameCuContextA                    = 18,
+  CUPTI_CBID_NVTX_nvtxNameCuContextW                    = 19,
+  CUPTI_CBID_NVTX_nvtxNameCuStreamA                     = 20,
+  CUPTI_CBID_NVTX_nvtxNameCuStreamW                     = 21,
+  CUPTI_CBID_NVTX_nvtxNameCuEventA                      = 22,
+  CUPTI_CBID_NVTX_nvtxNameCuEventW                      = 23,
+  CUPTI_CBID_NVTX_nvtxNameCudaDeviceA                   = 24,
+  CUPTI_CBID_NVTX_nvtxNameCudaDeviceW                   = 25,
+  CUPTI_CBID_NVTX_nvtxNameCudaStreamA                   = 26,
+  CUPTI_CBID_NVTX_nvtxNameCudaStreamW                   = 27,
+  CUPTI_CBID_NVTX_nvtxNameCudaEventA                    = 28,
+  CUPTI_CBID_NVTX_nvtxNameCudaEventW                    = 29,
+  CUPTI_CBID_NVTX_nvtxDomainMarkEx                      = 30,
+  CUPTI_CBID_NVTX_nvtxDomainRangeStartEx                = 31,
+  CUPTI_CBID_NVTX_nvtxDomainRangeEnd                    = 32,
+  CUPTI_CBID_NVTX_nvtxDomainRangePushEx                 = 33,
+  CUPTI_CBID_NVTX_nvtxDomainRangePop                    = 34,
+  CUPTI_CBID_NVTX_nvtxDomainResourceCreate              = 35,
+  CUPTI_CBID_NVTX_nvtxDomainResourceDestroy             = 36,
+  CUPTI_CBID_NVTX_nvtxDomainNameCategoryA               = 37,
+  CUPTI_CBID_NVTX_nvtxDomainNameCategoryW               = 38,
+  CUPTI_CBID_NVTX_nvtxDomainRegisterStringA             = 39,
+  CUPTI_CBID_NVTX_nvtxDomainRegisterStringW             = 40,
+  CUPTI_CBID_NVTX_nvtxDomainCreateA                     = 41,
+  CUPTI_CBID_NVTX_nvtxDomainCreateW                     = 42,
+  CUPTI_CBID_NVTX_nvtxDomainDestroy                     = 43,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserCreate              = 44,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserDestroy             = 45,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireStart        = 46,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireFailed       = 47,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireSuccess      = 48,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserReleasing           = 49,
+  CUPTI_CBID_NVTX_SIZE,
+  CUPTI_CBID_NVTX_FORCE_INT                             = 0x7fffffff
+} CUpti_nvtx_api_trace_cbid;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif    
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pmsampling.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pmsampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba4171b6710564b56bc7e8e64e46c3674fe6c58c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pmsampling.h
@@ -0,0 +1,490 @@
+/*
+ * Copyright 2024 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PMSAMPLING_H_)
+#define _CUPTI_PMSAMPLING_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#ifndef CUPTI_PROFILER_STRUCT_SIZE
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+/* CUPTI PM sampling APIs */
+/**
+ * \defgroup CUPTI_PM_SAMPLING_API CUPTI PM Sampling API
+ * Functions to enable, disable, start, stop, and decode PM sampling.
+ * @{
+ */
+typedef struct CUpti_PmSampling_Object CUpti_PmSampling_Object;
+
+typedef enum CUpti_PmSampling_TriggerMode
+{
+    /// The trigger is based on the SYSCLK frequency. Note: the SYS frequency is variable by default.
+    /// The sample interval (set in the struct CUpti_PmSampling_SetConfig_Params) is in terms of clocks.
+    CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_SYSCLK_INTERVAL = 0,
+    /// The trigger is based on a fixed frequency source.
+    /// The sample interval (set in the struct CUpti_PmSampling_SetConfig_Params) is in terms of nanoseconds.
+    /// Note: This trigger mode is not supported on Turing GPU architecture and GA100 GPU.
+    /// It is supported on Ampere GA10x and later GPU architectures.
+    CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_TIME_INTERVAL = 1,
+    CUPTI_PM_SAMPLING_TRIGGER_MODE_COUNT
+} CUpti_PmSampling_TriggerMode;
+
+typedef enum CUpti_PmSampling_DecodeStopReason
+{
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_OTHER = 0,
+    /// The counter data image is full.
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_COUNTER_DATA_FULL,
+    /// All the records in the hardware buffer have been decoded.
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_END_OF_RECORDS,
+    CUPTI_PM_SAMPLING_DECODE_STOP_REASON_COUNT
+} CUpti_PmSampling_DecodeStopReason;
+
+typedef enum CUpti_PmSampling_HardwareBuffer_AppendMode
+{
+    /// Keep the oldest records in the hardware buffer.
+    /// CUPTI will report an overflow error in case the hardware buffer fills up.
+    CUPTI_PM_SAMPLING_HARDWARE_BUFFER_APPEND_MODE_KEEP_OLDEST = 0,
+    /// Keep the latest records in the hardware buffer.
+    /// Note: This mode is not supported on Turing GPU architecture.
+    /// It is supported on Ampere and later GPU architectures.
+    CUPTI_PM_SAMPLING_HARDWARE_BUFFER_APPEND_MODE_KEEP_LATEST = 1
+} CUpti_PmSampling_HardwareBuffer_AppendMode;
+
+/**
+ * \brief Params for cuptiPmSamplingSetConfig
+ */
+typedef struct CUpti_PmSampling_SetConfig_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Size of the config image.
+    size_t configSize;
+    /// [in] Config image.
+    const uint8_t* pConfig;
+    /// [in] The size of the hardware buffer in which raw PM sampling data
+    /// will be stored. These samples will be decoded to the counter data
+    /// image with a \ref cuptiPmSamplingDecodeData call.
+    size_t hardwareBufferSize;
+    /// [in] For the trigger mode `CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_SYSCLK_INTERVAL`, the sampling interval
+    /// is the number of sys clock cycles. For the trigger mode `CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_TIME_INTERVAL`,
+    /// the sampling interval is in nanoseconds.
+    uint64_t samplingInterval;
+    /// [in] Trigger mode.
+    /// Note: CUPTI_PM_SAMPLING_TRIGGER_MODE_GPU_TIME_INTERVAL is not supported in Turing and GA100.
+    /// Supported from GA10x onwards.
+    CUpti_PmSampling_TriggerMode triggerMode;
+    /// [in] Append mode for the records in the hardware buffer.
+    /// For KEEP_OLDEST mode, all the records will be kept in the buffer and, in case the hardware buffer fills up,
+    /// overflow will be set to 1 in \ref CUpti_PmSampling_DecodeData_Params. For KEEP_LATEST mode, the new records will
+    /// overwrite the oldest records in the buffer once the buffer is full.
+    CUpti_PmSampling_HardwareBuffer_AppendMode hwBufferAppendMode;
+} CUpti_PmSampling_SetConfig_Params;
+
+#define CUpti_PmSampling_SetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_SetConfig_Params, hwBufferAppendMode)
+
+/**
+ * \brief Set the configuration for PM sampling, such as the sampling interval, the hardware buffer size,
+ * the trigger mode, the append mode and the config image which has scheduling info for metric collection.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_SetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED for a config image which requires multiple passes for data collection
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingSetConfig(CUpti_PmSampling_SetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingEnable
+ */
+typedef struct CUpti_PmSampling_Enable_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Device index.
+    size_t deviceIndex;
+    /// [out] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Enable_Params;
+
+#define CUpti_PmSampling_Enable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Enable_Params, pPmSamplingObject)
+
+/**
+ * \brief Create a PM sampling object and enable PM sampling on the CUDA device.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Enable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if memory allocation fails while creating the PM sampling object
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling is already enabled on the device
+ * \retval CUPTI_ERROR_INSUFFICIENT_PRIVILEGES if the user does not have sufficient privileges to perform the operation
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingEnable(CUpti_PmSampling_Enable_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingDisable
+ */
+typedef struct CUpti_PmSampling_Disable_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Disable_Params;
+
+#define CUpti_PmSampling_Disable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Disable_Params, pPmSamplingObject)
+
+/**
+ * \brief Disable PM sampling on the CUDA device and destroy the PM sampling object.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Disable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingDisable(CUpti_PmSampling_Disable_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingStart
+ */
+typedef struct CUpti_PmSampling_Start_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Start_Params;
+
+#define CUpti_PmSampling_Start_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Start_Params, pPmSamplingObject)
+
+/**
+ * \brief Start the PM sampling. The GPU will start collecting the metrics data
+ * periodically based on trigger type and sampling interval passed in CUpti_PmSampling_SetConfig_Params.
+ * The collected data will be stored in the hardware buffer.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Start_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling Start is called without enabling PM sampling,
+ * or if PM sampling is already started
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingStart(CUpti_PmSampling_Start_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingStop
+ */
+typedef struct CUpti_PmSampling_Stop_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+} CUpti_PmSampling_Stop_Params;
+
+#define CUpti_PmSampling_Stop_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_Stop_Params, pPmSamplingObject)
+
+/**
+ * \brief Stop the PM sampling. The GPU will stop collecting the metrics data.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_Stop_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling Stop is called without enabling PM sampling,
+ * or if PM sampling is already stopped
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingStop(CUpti_PmSampling_Stop_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingDecodeData
+ */
+typedef struct CUpti_PmSampling_DecodeData_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Counter data image.
+    uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [out] Reason why decoding stopped.
+    CUpti_PmSampling_DecodeStopReason decodeStopReason;
+    /// [out] Overflow status for the hardware buffer.
+    /// To avoid overflow, either increase the hardwareBufferSize value in
+    /// \ref CUpti_PmSampling_SetConfig_Params or sample less often (use a larger sampling interval).
+    uint8_t overflow;
+} CUpti_PmSampling_DecodeData_Params;
+
+#define CUpti_PmSampling_DecodeData_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_DecodeData_Params, overflow)
+
+/**
+ * \brief Decode the metrics data stored in the hardware buffer to the counter data image.
+ *
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_DecodeData_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling DecodeData is called without enabling PM sampling
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if there is record overflow in the hardware buffer
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingDecodeData(CUpti_PmSampling_DecodeData_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingGetCounterAvailability
+ */
+typedef struct CUpti_PmSampling_GetCounterAvailability_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Device index.
+    size_t deviceIndex;
+    /// [inout] Size of the counter availability image. When pCounterAvailabilityImage is NULL,
+    /// this field is used to return the size of the counter availability image.
+    size_t counterAvailabilityImageSize;
+    /// [out] Counter availability image.
+    uint8_t* pCounterAvailabilityImage;
+} CUpti_PmSampling_GetCounterAvailability_Params;
+#define CUpti_PmSampling_GetCounterAvailability_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_GetCounterAvailability_Params, pCounterAvailabilityImage)
+
+/**
+ * \brief Query counter availability information in a buffer which can be used to filter unavailable raw metrics on host.
+ * Note: This API may fail, if any profiling or sampling session is active on the specified device.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_GetCounterAvailability_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INSUFFICIENT_PRIVILEGES if the user does not have sufficient privileges to perform the operation
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingGetCounterAvailability(CUpti_PmSampling_GetCounterAvailability_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingGetCounterDataSize
+ */
+typedef struct CUpti_PmSampling_GetCounterDataSize_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Names of the metrics to be collected.
+    const char** pMetricNames;
+    /// [in] Number of metrics to be collected.
+    size_t numMetrics;
+    /// [in] Maximum number of samples to be stored in the counter data image.
+    uint32_t maxSamples;
+    /// [out] Size of the counter data image.
+    size_t counterDataSize;
+} CUpti_PmSampling_GetCounterDataSize_Params;
+#define CUpti_PmSampling_GetCounterDataSize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_GetCounterDataSize_Params, counterDataSize)
+
+/**
+ * \brief Query the size of the counter data image which will be used to store the metrics data.
+ * The user needs to allocate the memory for the counter data image based on the size returned by this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_GetCounterDataSize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling GetCounterDataSize is called without enabling PM sampling
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingGetCounterDataSize(CUpti_PmSampling_GetCounterDataSize_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingCounterDataImageInitialize
+ */
+typedef struct CUpti_PmSampling_CounterDataImage_Initialize_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Size of the counter data image.
+    size_t counterDataSize;
+    /// [in] Counter data image.
+    uint8_t* pCounterData;
+} CUpti_PmSampling_CounterDataImage_Initialize_Params;
+#define CUpti_PmSampling_CounterDataImage_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_CounterDataImage_Initialize_Params, pCounterData)
+
+/**
+ * \brief Initialize the counter data to CUPTI record format for storing the metric data.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_CounterDataImage_Initialize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if PM sampling CounterDataInitialize is called without enabling PM sampling
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingCounterDataImageInitialize(CUpti_PmSampling_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingGetCounterDataInfo
+ */
+typedef struct CUpti_PmSampling_GetCounterDataInfo_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [out] Number of samples in the counter data image.
+    size_t numTotalSamples;
+    /// [out] Number of populated samples.
+    size_t numPopulatedSamples;
+    /// [out] Number of samples that have been completed.
+    size_t numCompletedSamples;
+} CUpti_PmSampling_GetCounterDataInfo_Params;
+#define CUpti_PmSampling_GetCounterDataInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_GetCounterDataInfo_Params, numCompletedSamples)
+
+/**
+ * \brief Get the counter data info like number of samples, number of populated
+ * samples and number of completed samples in a counter data image.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_GetCounterDataInfo_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingGetCounterDataInfo(CUpti_PmSampling_GetCounterDataInfo_Params* pParams);
+
+/**
+ * \brief Params for cuptiPmSamplingCounterDataGetSampleInfo
+ */
+typedef struct CUpti_PmSampling_CounterData_GetSampleInfo_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] PM sampling object.
+    CUpti_PmSampling_Object* pPmSamplingObject;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [in] Index of the sample.
+    size_t sampleIndex;
+    /// [out] Start time of the sample.
+    uint64_t startTimestamp;
+    /// [out] End time of the sample.
+    uint64_t endTimestamp;
+} CUpti_PmSampling_CounterData_GetSampleInfo_Params;
+#define CUpti_PmSampling_CounterData_GetSampleInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_PmSampling_CounterData_GetSampleInfo_Params, endTimestamp)
+
+/**
+ * \brief Get the sample info (start and end time stamp) for the given sample index.
+ * Each sample is distinguished by the start and end time stamp.
+ *
+ * \param pParams A pointer to \ref CUpti_PmSampling_CounterData_GetSampleInfo_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiPmSamplingCounterDataGetSampleInfo(CUpti_PmSampling_CounterData_GetSampleInfo_Params* pParams);
+
+/** @} */ /* END CUPTI_PM_SAMPLING_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // _CUPTI_PMSAMPLING_H_
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_host.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_host.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e38ceb160791ae51fd681623d45dba1c688dda1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_host.h
@@ -0,0 +1,541 @@
+/*
+ * Copyright 2024 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PROFILER_HOST_H_)
+#define _CUPTI_PROFILER_HOST_H_
+
+/*
+CUPTI profiler host APIs
+This file contains the CUPTI profiling host APIs.
+*/
+#include <cupti_result.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <string>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PROFILER_HOST_API CUPTI Profiler Host API
+ * Functions, types, and enums that implement the CUPTI Profiler Host API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE /* guarded: cupti_profiler_target.h defines the identical macro */
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE: byte offset of lastfield_ in type_ plus sizeof(lastfield_) */
+
+typedef enum CUpti_MetricType
+{
+    CUPTI_METRIC_TYPE_COUNTER = 0,  ///< Counter metric
+    CUPTI_METRIC_TYPE_RATIO,        ///< Ratio metric
+    CUPTI_METRIC_TYPE_THROUGHPUT,   ///< Throughput metric
+    CUPTI_METRIC_TYPE__COUNT        ///< Count of the metric types listed above
+} CUpti_MetricType;
+
+typedef enum CUpti_ProfilerType
+{
+    CUPTI_PROFILER_TYPE_RANGE_PROFILER,   ///< Range profiler (implicit value 0, so a zero-initialized field selects this)
+    CUPTI_PROFILER_TYPE_PM_SAMPLING,      ///< PM sampling (implicit value 1)
+    CUPTI_PROFILER_TYPE_PROFILER_INVALID  ///< Invalid profiler type (implicit value 2, listed last rather than first)
+} CUpti_ProfilerType;
+
+typedef struct CUpti_Profiler_Host_Object CUpti_Profiler_Host_Object; /* only forward-declared in this header */
+
+/**
+ * \brief Params for cuptiProfilerHostInitialize
+ */
+typedef struct CUpti_Profiler_Host_Initialize_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_Initialize_Params_STRUCT_SIZE).
+    /// The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+    /// available in the structure; this preserves backward compatibility.
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Profiler kind; one of the CUpti_ProfilerType values
+    CUpti_ProfilerType profilerType;
+    /// [in] Chip name; accepted for chips supported at the time of release.
+    const char* pChipName;
+    /// [in] Buffer with the counter availability image - required for future chip support
+    const uint8_t* pCounterAvailabilityImage;
+    /// [out] Profiler host object (binary blob) allocated by CUPTI; host operations are associated with this object.
+    CUpti_Profiler_Host_Object* pHostObject;
+} CUpti_Profiler_Host_Initialize_Params;
+
+#define CUpti_Profiler_Host_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_Initialize_Params, pHostObject)
+
+/**
+ * \brief Create and initialize the profiler host object (CUpti_Profiler_Host_Object).
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_Initialize_Params; the created object is returned in its pHostObject field
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostInitialize(CUpti_Profiler_Host_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostDeinitialize
+ */
+typedef struct CUpti_Profiler_Host_Deinitialize_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_Deinitialize_Params_STRUCT_SIZE).
+    /// The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+    /// available in the structure; this preserves backward compatibility.
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    struct CUpti_Profiler_Host_Object* pHostObject;
+} CUpti_Profiler_Host_Deinitialize_Params;
+
+#define CUpti_Profiler_Host_Deinitialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_Deinitialize_Params, pHostObject)
+
+/**
+ * \brief Deinitialize and destroy the profiler host object (CUpti_Profiler_Host_Object).
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_Deinitialize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostDeinitialize(CUpti_Profiler_Host_Deinitialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetSupportedChips
+ */
+typedef struct CUpti_Profiler_Host_GetSupportedChips_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetSupportedChips_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [out] Number of supported chips
+    size_t numChips;
+    /// [out] List of supported chip names (read-only strings; ownership is not stated here - TODO confirm)
+    const char* const* ppChipNames;
+} CUpti_Profiler_Host_GetSupportedChips_Params;
+
+#define CUpti_Profiler_Host_GetSupportedChips_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetSupportedChips_Params, ppChipNames)
+
+/**
+ * \brief Get the list of supported chips.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetSupportedChips_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetSupportedChips(CUpti_Profiler_Host_GetSupportedChips_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetBaseMetrics
+ */
+typedef struct CUpti_Profiler_Host_GetBaseMetrics_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetBaseMetrics_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    struct CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] Metric type to query (counter, ratio, throughput)
+    CUpti_MetricType metricType;
+    /// [out] List of base metrics of the queried metric type supported for the chip
+    const char** ppMetricNames;
+    /// [out] Number of metrics
+    size_t numMetrics;
+} CUpti_Profiler_Host_GetBaseMetrics_Params;
+
+#define CUpti_Profiler_Host_GetBaseMetrics_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetBaseMetrics_Params, numMetrics)
+
+/**
+ * \brief Get the list of supported base metrics for the chip.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetBaseMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetBaseMetrics(CUpti_Profiler_Host_GetBaseMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetSubMetrics
+ */
+typedef struct CUpti_Profiler_Host_GetSubMetrics_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetSubMetrics_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] Metric type of the queried metric
+    CUpti_MetricType metricType;
+    /// [in] Name of the metric whose sub-metrics will be listed
+    const char* pMetricName;
+    /// [out] Number of sub-metrics supported
+    size_t numOfSubmetrics;
+    /// [out] List of sub-metrics supported for the metric.
+    const char** ppSubMetrics;
+} CUpti_Profiler_Host_GetSubMetrics_Params;
+
+#define CUpti_Profiler_Host_GetSubMetrics_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetSubMetrics_Params, ppSubMetrics)
+
+/**
+ * \brief Get the list of supported sub-metrics for the metric.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetSubMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetSubMetrics(CUpti_Profiler_Host_GetSubMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetMetricProperties
+ */
+typedef struct CUpti_Profiler_Host_GetMetricProperties_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetMetricProperties_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] Name of the metric whose properties will be listed
+    const char* pMetricName;
+    /// [out] A short description of the metric
+    const char* pDescription;
+    /// [out] Hardware unit associated with the metric
+    const char* pHwUnit;
+    /// [out] Dimension of the metric values
+    const char* pDimUnit;
+    /// [out] Metric type (counter, ratio or throughput)
+    CUpti_MetricType metricType;
+} CUpti_Profiler_Host_GetMetricProperties_Params;
+
+#define CUpti_Profiler_Host_GetMetricProperties_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetMetricProperties_Params, metricType)
+
+/**
+ * \brief Get the properties of the metric.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetMetricProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetMetricProperties(CUpti_Profiler_Host_GetMetricProperties_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetRangeName
+ */
+typedef struct CUpti_Profiler_Host_GetRangeName_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetRangeName_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Counter data image where profiling data has been decoded
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image
+    size_t counterDataImageSize;
+    /// [in] Range index for which the range name will be queried
+    size_t rangeIndex;
+    /// [in] Delimiter used between nested range names (Range1<delimiter>Range2); default="/".
+    const char* delimiter;
+    /// [out] The range name.
+    /// Note: CUPTI allocates this memory internally and
+    /// it is the user's responsibility to free the allocated memory.
+    const char* pRangeName;
+} CUpti_Profiler_Host_GetRangeName_Params;
+
+#define CUpti_Profiler_Host_GetRangeName_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetRangeName_Params, pRangeName)
+
+/**
+ * \brief Get the range name for the range index stored in the counter data.
+ * In the Range profiler, for auto range mode the range name is a numeric value
+ * assigned to the kernel based on execution order. For user range mode, the
+ * range name is based on the name provided by the user through the
+ * Push range API.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetRangeName_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetRangeName(CUpti_Profiler_Host_GetRangeName_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostEvaluateToGpuValues
+ */
+typedef struct CUpti_Profiler_Host_EvaluateToGpuValues_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_EvaluateToGpuValues_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] Counter data image where profiling data has been decoded
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image
+    size_t counterDataImageSize;
+    /// [in] Range index for which the metric values will be evaluated
+    size_t rangeIndex;
+    /// [in] Metrics for which GPU values will be evaluated for the range
+    const char** ppMetricNames;
+    /// [in] Number of metrics
+    size_t numMetrics;
+    /// [out] Output values for the given metrics and range index (presumably numMetrics entries - TODO confirm)
+    double* pMetricValues;
+} CUpti_Profiler_Host_EvaluateToGpuValues_Params;
+
+#define CUpti_Profiler_Host_EvaluateToGpuValues_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_EvaluateToGpuValues_Params, pMetricValues)
+
+/**
+ * \brief Evaluate the metric values for the range index stored in the counter data.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_EvaluateToGpuValues_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostEvaluateToGpuValues(CUpti_Profiler_Host_EvaluateToGpuValues_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostConfigAddMetrics
+ */
+typedef struct CUpti_Profiler_Host_ConfigAddMetrics_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_ConfigAddMetrics_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    struct CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] Names of the metrics for which the config image will be generated
+    const char** ppMetricNames;
+    /// [in] Number of metrics
+    size_t numMetrics;
+} CUpti_Profiler_Host_ConfigAddMetrics_Params;
+
+#define CUpti_Profiler_Host_ConfigAddMetrics_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_ConfigAddMetrics_Params, numMetrics)
+
+/**
+ * \brief Add the metrics to the profiler host object for generating the config image.
+ * The config image will have the required information to schedule the metrics for
+ * collecting the profiling data.
+ * Note: PM sampling only supports a single-pass config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_ConfigAddMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if the metric name is not valid or not supported for the chip
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostConfigAddMetrics(CUpti_Profiler_Host_ConfigAddMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetConfigImageSize
+ */
+typedef struct CUpti_Profiler_Host_GetConfigImageSize_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetConfigImageSize_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [out] Size of the config image; users need to allocate the buffer for storing it
+    size_t configImageSize;
+} CUpti_Profiler_Host_GetConfigImageSize_Params;
+
+#define CUpti_Profiler_Host_GetConfigImageSize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetConfigImageSize_Params, configImageSize)
+
+/**
+ * \brief Get the size of the config image for the metrics added to the profiler host object.
+ * Users need to allocate the buffer for storing the config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetConfigImageSize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetConfigImageSize(CUpti_Profiler_Host_GetConfigImageSize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetConfigImage
+ */
+typedef struct CUpti_Profiler_Host_GetConfigImage_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetConfigImage_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Reference to the profiler host object allocated by CUPTI in cuptiProfilerHostInitialize
+    CUpti_Profiler_Host_Object* pHostObject;
+    /// [in] Number of bytes allocated for pConfigImage
+    size_t configImageSize;
+    /// [out] Buffer receiving the config image
+    uint8_t* pConfigImage;
+} CUpti_Profiler_Host_GetConfigImage_Params;
+
+#define CUpti_Profiler_Host_GetConfigImage_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetConfigImage_Params, pConfigImage)
+
+/**
+ * \brief Get the config image for the metrics added to the profiler host object.
+ * The user passes the allocated buffer in which the config image is stored.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetConfigImage_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetConfigImage(CUpti_Profiler_Host_GetConfigImage_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetNumOfPasses
+ */
+typedef struct CUpti_Profiler_Host_GetNumOfPasses_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetNumOfPasses_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Number of bytes allocated for pConfigImage
+    size_t configImageSize;
+    /// [in] The config image buffer (declared non-const even though it is documented as an input)
+    uint8_t* pConfigImage;
+    /// [out] Number of passes required for profiling the metrics scheduled in the config image
+    size_t numOfPasses;
+} CUpti_Profiler_Host_GetNumOfPasses_Params;
+
+#define CUpti_Profiler_Host_GetNumOfPasses_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetNumOfPasses_Params, numOfPasses)
+
+/**
+ * \brief Get the number of passes required for profiling the scheduled metrics in the config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetNumOfPasses_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetNumOfPasses(CUpti_Profiler_Host_GetNumOfPasses_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerHostGetMaxNumHardwareMetricsPerPass
+ */
+typedef struct CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params
+{
+    /// [in] Size of the data structure (see CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Assign to NULL
+    void* pPriv;
+    /// [in] Profiler kind; one of the CUpti_ProfilerType values
+    CUpti_ProfilerType profilerType;
+    /// [in] Chip name; accepted for chips supported at the time of release.
+    const char* pChipName;
+    /// [in] Buffer with the counter availability image - required for future chip support
+    uint8_t* pCounterAvailabilityImage;
+    /// [out] Maximum number of metrics that can be scheduled in a pass
+    size_t maxMetricsPerPass;
+} CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params;
+
+#define CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params, maxMetricsPerPass)
+
+/**
+ * \brief Get the maximum number of hardware metrics (metric names which don't include the _sass_ keyword)
+ * that can be scheduled in a single pass for a chip. While this represents a theoretical upper limit,
+ * practical constraints may prevent reaching this threshold for a specific set of metrics. Furthermore,
+ * the maximum achievable value is contingent upon the characteristics and architecture of the chip in question.
+ *
+ * Use the cuptiProfilerHostGetNumOfPasses API to get the actual number of passes required
+ * for collecting the profiling data for the scheduled metrics in a config image.
+ *
+ * \param pParams A pointer to \ref CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiProfilerHostGetMaxNumHardwareMetricsPerPass(CUpti_Profiler_Host_GetMaxNumHardwareMetricsPerPass_Params* pParams);
+
+/** @} */ /* END CUPTI_PROFILER_HOST_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8fc197073dcb3bdec1a7349d136ac03434dc932
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_profiler_target.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright 2011-2023   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PROFILER_TARGET_H_)
+#define _CUPTI_PROFILER_TARGET_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PROFILER_API CUPTI Profiling API
+ * Functions, types, and enums that implement the CUPTI Profiling API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE /* guarded: cupti_profiler_host.h defines the identical macro */
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE: byte offset of lastfield_ in type_ plus sizeof(lastfield_) */
+
+/**
+ * \brief Profiler range attribute
+ *
+ * A metric enabled in the session's configuration is collected separately per unique range-stack in the pass.
+ * This attribute selects whether metrics are collected around each kernel in a profiling session or in a user-defined range.
+ */
+typedef enum
+{
+    /**
+     * Invalid value (implicit value 0)
+     */
+    CUPTI_Range_INVALID,
+    /**
+     * Ranges are automatically defined around each kernel in a profiling session
+     */
+    CUPTI_AutoRange,
+    /**
+     * The range in which metric data is collected is defined by the user
+     */
+    CUPTI_UserRange,
+    /**
+     * Range count
+     */
+    CUPTI_Range_COUNT,
+} CUpti_ProfilerRange;
+
+/**
+ * \brief Profiler replay attribute
+ *
+ * For metrics which require multipass collection, a replay of the GPU kernel(s) is required.
+ * This attribute specifies how the replay of the kernel(s) to be measured is done.
+ */
+typedef enum
+{
+    /**
+     * Invalid value (implicit value 0)
+     */
+    CUPTI_Replay_INVALID,
+    /**
+     * Replay is done by the CUPTI user around the process
+     */
+    CUPTI_ApplicationReplay,
+    /**
+     * Replay is done implicitly by CUPTI around the kernel
+     */
+    CUPTI_KernelReplay,
+    /**
+     * Replay is done by the CUPTI user within a process
+     */
+    CUPTI_UserReplay,
+    /**
+     * Replay count
+     */
+    CUPTI_Replay_COUNT,
+} CUpti_ProfilerReplayMode;
+
+/**
+ * \brief Params for cuptiProfilerInitialize (only the common structSize/pPriv header fields)
+ */
+typedef struct CUpti_Profiler_Initialize_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] Assign to NULL
+
+} CUpti_Profiler_Initialize_Params;
+#define CUpti_Profiler_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Initialize_Params, pPriv)
+
+/**
+ * \brief Params for cuptiProfilerDeInitialize (only the common structSize/pPriv header fields)
+ */
+typedef struct CUpti_Profiler_DeInitialize_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] Assign to NULL
+
+} CUpti_Profiler_DeInitialize_Params;
+#define CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeInitialize_Params, pPriv)
+
+/**
+ * \brief Initializes the profiler interface (takes \ref CUpti_Profiler_Initialize_Params)
+ *
+ * Loads the required libraries into the process address space.
+ * Sets up the hooks with the CUDA driver.
+ */
+CUptiResult CUPTIAPI cuptiProfilerInitialize(CUpti_Profiler_Initialize_Params *pParams);
+
+/**
+ * \brief Deinitializes the profiler interface (takes \ref CUpti_Profiler_DeInitialize_Params)
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeInitialize(CUpti_Profiler_DeInitialize_Params *pParams);
+
+/**
+ * \brief Input parameters that define the counterDataImage
+ */
+typedef struct CUpti_Profiler_CounterDataImageOptions
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] Assign to NULL
+
+    const uint8_t* pCounterDataPrefix;                          /**< [in] Address of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+                                                                    Must be align(8) (presumably 8-byte aligned - TODO confirm).*/
+    size_t counterDataPrefixSize;                               //!< [in] Size of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+    uint32_t maxNumRanges;                                      //!< [in] Maximum number of ranges that can be profiled
+    uint32_t maxNumRangeTreeNodes;                              //!< [in] Maximum number of RangeTree nodes; must be >= maxNumRanges
+    uint32_t maxRangeNameLength;                                //!< [in] Maximum string length of each RangeName, including the trailing NUL character
+} CUpti_Profiler_CounterDataImageOptions;
+#define CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImageOptions, maxRangeNameLength)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateSize_Params
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] Assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [out] Calculated size of the counter data image
+} CUpti_Profiler_CounterDataImage_CalculateSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE         CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateSize_Params, counterDataImageSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitialize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_Initialize_Params
+{
+    size_t structSize;                                          //!< [in] CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] Assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [in] Size calculated by cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                                 //!< [in] The buffer to be initialized (written through this pointer).
+} CUpti_Profiler_CounterDataImage_Initialize_Params;
+#define CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_Initialize_Params, pCounterDataImage)
+
+/**
+ * \brief A CounterData image allocates space for values for each counter for each range.
+ *
+ * The user bears the responsibility of managing the counterDataImage allocations.
+ * CounterDataPrefix contains meta data about the metrics that will be stored in counterDataImage.
+ * Use these APIs to calculate the allocation size and initialize counterData image.
+ */
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageCalculateSize(CUpti_Profiler_CounterDataImage_CalculateSize_Params* pParams);
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageInitialize(CUpti_Profiler_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateScratchBufferSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [out] calculated scratch buffer size; pass it to cuptiProfilerCounterDataImageInitializeScratchBuffer
+} CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params, counterDataScratchBufferSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitializeScratchBuffer
+ */
+typedef struct CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated using cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] the scratch buffer to be initialized.
+} CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params;
+#define CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params, pCounterDataScratchBuffer)
+
+/**
+ * \brief A temporary storage for CounterData image needed for internal operations
+ *
+ * Use these APIs to calculate the allocation size and initialize counterData image scratch buffer.
+ */
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageCalculateScratchBufferSize(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params* pParams);
+CUptiResult CUPTIAPI cuptiProfilerCounterDataImageInitializeScratchBuffer(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginSession
+ */
+typedef struct CUpti_Profiler_BeginSession_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_BeginSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] address of CounterDataImage scratch buffer
+    uint8_t bDumpCounterDataInFile;                          //!< [in] [optional]
+    const char* pCounterDataFilePath;                        //!< [in] [optional]
+    CUpti_ProfilerRange range;                               //!< [in] CUpti_ProfilerRange
+    CUpti_ProfilerReplayMode replayMode;                     //!< [in] CUpti_ProfilerReplayMode
+    /* Replay options, required when replay is done by cupti user */
+    size_t maxRangesPerPass;                                //!< [in] Maximum number of ranges that can be recorded in a single pass.
+    size_t maxLaunchesPerPass;                              //!< [in] Maximum number of kernel launches that can be recorded in a single pass; must be >= maxRangesPerPass.
+
+} CUpti_Profiler_BeginSession_Params;
+#define CUpti_Profiler_BeginSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginSession_Params, maxLaunchesPerPass)
+/**
+ * \brief Params for cuptiProfilerEndSession
+ */
+typedef struct CUpti_Profiler_EndSession_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EndSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EndSession_Params;
+#define CUpti_Profiler_EndSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndSession_Params, ctx)
+
+/**
+ * \brief Begin profiling session sets up the profiling on the device
+ *
+ * Although, it doesn't start the profiling but GPU resources needed for profiling are allocated.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerBeginSession(CUpti_Profiler_BeginSession_Params* pParams);
+/**
+ * \brief Ends profiling session
+ *
+ * Frees up the GPU resources acquired for profiling.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEndSession(CUpti_Profiler_EndSession_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerSetConfig
+ */
+typedef struct CUpti_Profiler_SetConfig_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_SetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const uint8_t* pConfig;                                 //!< [in] Config created by NVPW_RawMetricsConfig_GetConfigImage(). Must be align(8).
+    size_t configSize;                                      //!< [in] size of config
+    uint16_t minNestingLevel;                               //!< [in] the lowest nesting level to be profiled; must be >= 1
+    uint16_t numNestingLevels;                              //!< [in] the number of nesting levels to profile; must be >= 1
+    size_t passIndex;                                       //!< [in] Set this to zero for in-app replay; set this to the output of EndPass() for application replay
+    uint16_t targetNestingLevel;                            //!< [in] Set this to minNestingLevel for in-app replay; set this to the output of EndPass() for application replay
+} CUpti_Profiler_SetConfig_Params;
+
+#define CUpti_Profiler_SetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_SetConfig_Params, targetNestingLevel)
+
+/**
+ * \brief Params for cuptiProfilerUnsetConfig
+ */
+typedef struct CUpti_Profiler_UnsetConfig_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_UnsetConfig_Params;
+#define CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_UnsetConfig_Params, ctx)
+
+/**
+ * \brief Set metrics configuration to be profiled
+ *
+ * Use these APIs to set the config to profile in a session. It can be used for advanced cases such as where multiple
+ * configurations are collected into a single CounterData Image on the need basis, without restarting the session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerSetConfig(CUpti_Profiler_SetConfig_Params* pParams);
+/**
+ * \brief Unset metrics configuration profiled
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerUnsetConfig(CUpti_Profiler_UnsetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginPass
+ */
+typedef struct CUpti_Profiler_BeginPass_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_BeginPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_BeginPass_Params;
+#define CUpti_Profiler_BeginPass_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginPass_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerEndPass
+ */
+typedef struct CUpti_Profiler_EndPass_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EndPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    uint16_t targetNestingLevel;                            //!< [out] The targetNestingLevel that will be collected by the *next* BeginPass.
+    size_t passIndex;                                       //!< [out] The passIndex that will be collected by the *next* BeginPass
+    uint8_t allPassesSubmitted;                             //!< [out] becomes true when the last pass has been queued to the GPU
+} CUpti_Profiler_EndPass_Params;
+#define CUpti_Profiler_EndPass_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndPass_Params, allPassesSubmitted)
+
+/**
+ * \brief Replay API: used for multipass collection.
+
+ * These APIs are used if user chooses to replay by itself \ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ */
+CUptiResult CUPTIAPI cuptiProfilerBeginPass(CUpti_Profiler_BeginPass_Params* pParams);
+
+/**
+ * \brief Replay API: used for multipass collection.
+
+ * These APIs are used if user chooses to replay by itself \ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ * Returns information for next pass.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEndPass(CUpti_Profiler_EndPass_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerEnableProfiling
+ */
+typedef struct CUpti_Profiler_EnableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EnableProfiling_Params;
+#define CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EnableProfiling_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerDisableProfiling
+ */
+typedef struct CUpti_Profiler_DisableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_DisableProfiling_Params;
+#define CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DisableProfiling_Params, ctx)
+
+/**
+ * \brief Enables Profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEnableProfiling(CUpti_Profiler_EnableProfiling_Params* pParams);
+
+/**
+ * \brief Disable Profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDisableProfiling(CUpti_Profiler_DisableProfiling_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerIsPassCollected
+ */
+typedef struct CUpti_Profiler_IsPassCollected_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed pass
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer due to buffer full
+    uint8_t onePassCollected;                               //!< [out] true if a pass was successfully decoded
+    uint8_t allPassesCollected;                             //!< [out] becomes true when the last pass has been decoded
+} CUpti_Profiler_IsPassCollected_Params;
+#define CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_IsPassCollected_Params, allPassesCollected)
+
+/**
+ * \brief Asynchronous call to query if the submitted pass to GPU is collected
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerIsPassCollected(CUpti_Profiler_IsPassCollected_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerFlushCounterData
+ */
+typedef struct CUpti_Profiler_FlushCounterData_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed passes
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer due to buffer full
+} CUpti_Profiler_FlushCounterData_Params;
+#define CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_FlushCounterData_Params, numTraceBytesDropped)
+
+/**
+ * \brief Decode all the submitted passes
+ *
+ * Flush Counter data API to ensure every pass is decoded into the counterDataImage passed at beginSession.
+ * This will cause the CPU/GPU sync to collect all the undecoded pass.
+ */
+CUptiResult CUPTIAPI cuptiProfilerFlushCounterData(CUpti_Profiler_FlushCounterData_Params* pParams);
+
+typedef struct CUpti_Profiler_PushRange_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_PushRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const char* pRangeName;                                 //!< [in] specifies the range for subsequent launches; must not be NULL
+    size_t rangeNameLength;                                 //!< [in] assign to strlen(pRangeName) if known; if set to zero, the library will call strlen()
+} CUpti_Profiler_PushRange_Params;
+#define CUpti_Profiler_PushRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PushRange_Params, rangeNameLength)
+
+typedef struct CUpti_Profiler_PopRange_Params
+{
+    size_t structSize;                                      //!< [in] CUpti_Profiler_PopRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_PopRange_Params;
+#define CUpti_Profiler_PopRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PopRange_Params, ctx)
+
+
+/**
+ * \brief Range API's : Push user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPushRange(CUpti_Profiler_PushRange_Params *pParams);
+
+/**
+ * \brief Range API's : Pop user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPopRange(CUpti_Profiler_PopRange_Params *pParams);
+
+/**
+ * \brief Params for cuptiProfilerGetCounterAvailability
+ */
+typedef struct CUpti_Profiler_GetCounterAvailability_Params
+{
+    size_t structSize;                                  //!< [in] CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE
+    void* pPriv;                                        //!< [in] assign to NULL
+    CUcontext ctx;                                      //!< [in] if NULL, the current CUcontext is used
+    size_t counterAvailabilityImageSize;                //!< [in/out] If `pCounterAvailabilityImage` is NULL, then the required size is returned in
+                                                        //!< `counterAvailabilityImageSize`, otherwise `counterAvailabilityImageSize` should be set to the size of
+                                                        //!< `pCounterAvailabilityImage`, and on return it would be overwritten with number of actual bytes copied
+    uint8_t* pCounterAvailabilityImage;                 //!< [in] buffer receiving counter availability image, may be NULL
+} CUpti_Profiler_GetCounterAvailability_Params;
+#define CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_GetCounterAvailability_Params, pCounterAvailabilityImage)
+
+/**
+ * \brief Query counter availability
+ *
+ * Use this API to query counter availability information in a buffer which can be used to filter unavailable raw metrics on host.
+ * Note: This API may fail, if any profiling or sampling session is active on the specified context or its device.
+ */
+CUptiResult CUPTIAPI cuptiProfilerGetCounterAvailability(CUpti_Profiler_GetCounterAvailability_Params *pParams);
+
+/// Generic support level enum for CUPTI
+typedef enum
+{
+    CUPTI_PROFILER_CONFIGURATION_UNKNOWN = 0, //!< Configuration support level unknown - either detection code errored out before setting this value, or unable to determine it
+    CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED, //!< Profiling is unavailable.  For specific feature fields, this means that the current configuration of this feature does not work with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would be returned for SLI on an SLI-enabled device.
+    CUPTI_PROFILER_CONFIGURATION_DISABLED,    //!< Profiling would be available for this configuration, but was disabled by the system
+    CUPTI_PROFILER_CONFIGURATION_SUPPORTED    //!< Profiling is supported.  For specific feature fields, this means that the current configuration of this feature works with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would only be returned for devices which are not SLI-enabled.
+} CUpti_Profiler_Support_Level;
+
+/**
+ * \brief Profiler API types
+ */
+typedef enum
+{
+    CUPTI_PROFILER_RANGE_PROFILING = 0,       //!< CUPTI APIs for range based profiling (cuptiProfiler*)
+    CUPTI_PROFILER_PC_SAMPLING,               //!< CUPTI APIs collecting pc sampling data (cuptiPcSampling*)
+    CUPTI_PROFILER_SASS_METRICS,              //!< CUPTI APIs collecting SASS metrics data (cuptiSassMetrics*)
+    CUPTI_PROFILER_PM_SAMPLING,               //!< CUPTI APIs collecting PM Sampling data (cuptiPmSampling*)
+    CUPTI_PROFILER_UNKNOWN
+} CUpti_Profiler_API;
+
+/**
+ * \brief Params for cuptiProfilerDeviceSupported
+ */
+typedef struct
+{
+    size_t structSize;                                //!< [in] Must be CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE
+    void *pPriv;                                      //!< [in] assign to NULL
+    CUdevice cuDevice;                                //!< [in] compute device to query; original note: "if NULL, the current CUcontext is used" (NOTE(review): wording matches the CUcontext fields elsewhere - confirm what NULL means for a CUdevice)
+
+    CUpti_Profiler_Support_Level isSupported;         //!< [out] overall SUPPORTED / UNSUPPORTED flag representing whether Profiling and PC Sampling APIs work on the given device and configuration. SUPPORTED if all following flags are SUPPORTED, UNSUPPORTED otherwise.
+
+    CUpti_Profiler_Support_Level architecture;        //!< [out] SUPPORTED if the device architecture level supports the Profiling API (Compute Capability >= 7.0), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level sli;                 //!< [out] SUPPORTED if SLI is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level vGpu;                //!< [out] SUPPORTED if vGPU is supported and profiling is enabled, DISABLED if profiling is supported but not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level confidentialCompute; //!< [out] SUPPORTED if confidential compute is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level cmp;                 //!< [out] SUPPORTED if not NVIDIA Crypto Mining Processors (CMP), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level wsl;                 //!< [out] SUPPORTED if WSL supported, UNSUPPORTED otherwise
+    CUpti_Profiler_API     api;                       //!< [in] the CUPTI API type for which device support will be checked
+} CUpti_Profiler_DeviceSupported_Params;
+#define CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeviceSupported_Params, api)
+
+/**
+ * \brief Query device compatibility with Profiling API
+ *
+ * Use this call to determine whether a compute device and configuration are compatible with the Profiling API.
+ * If the configuration does not support profiling, one of several flags will indicate why.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeviceSupported(CUpti_Profiler_DeviceSupported_Params *pParams);
+
+/** @} */ /* END CUPTI_METRIC_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /*_CUPTI_PROFILER_TARGET_H_*/
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_range_profiler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_range_profiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebcb25c0921bf473df943d63f476b877fdec2d66
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_range_profiler.h
@@ -0,0 +1,465 @@
+/*
+ * Copyright 2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_RANGE_PROFILER_H_)
+#define _CUPTI_RANGE_PROFILER_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_profiler_target.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_RANGE_PROFILER_API CUPTI Range Profiling API
+ * Functions, types, and enums that implement the CUPTI Range Profiling API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+
+typedef struct CUpti_RangeProfiler_Object CUpti_RangeProfiler_Object;
+
+/**
+ * \brief Params for cuptiRangeProfilerSetConfig
+ */
+typedef struct CUpti_RangeProfiler_SetConfig_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Size of the config image.
+    size_t configSize;
+    /// [in] Config image.
+    const uint8_t* pConfig;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [in] Counter data image.
+    uint8_t* pCounterDataImage;
+    /// [in] Profiling Range mode.
+    CUpti_ProfilerRange range;
+    /// [in] Replay mode.
+    CUpti_ProfilerReplayMode replayMode;
+    /// [in] Maximum number of ranges that can be profiled in a pass.
+    size_t maxRangesPerPass;
+    /// [in] number of nesting level to be profiled. For Auto range mode, this should be set to 1.
+    uint16_t numNestingLevels;
+    /// [in] minimum nesting level to be profiled.
+    uint16_t minNestingLevel;
+    /// [in] Pass index for the replay session.
+    size_t passIndex;
+    /// [in] Target nesting level for the replay session.
+    uint16_t targetNestingLevel;
+} CUpti_RangeProfiler_SetConfig_Params;
+
+#define CUpti_RangeProfiler_SetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_SetConfig_Params, targetNestingLevel)
+
+/**
+ * \brief Set the configuration for range profiler like maximum number of ranges per pass, number of nesting levels,
+ * range and replay mode and the config image which has scheduling info for metric collection.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_SetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerSetConfig(CUpti_RangeProfiler_SetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerEnable
+ */
+typedef struct CUpti_RangeProfiler_Enable_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Context to be used for profiling.
+    CUcontext ctx;
+    /// [out] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_Enable_Params;
+#define CUpti_RangeProfiler_Enable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Enable_Params, pRangeProfilerObject)
+
+/**
+ * \brief Create a range profiler object and enable range profiling on the CUDA context.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Enable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if memory allocation fails while creating the range profiler object
+ * \retval CUPTI_ERROR_INSUFFICIENT_PRIVILEGES if the user does not have sufficient privileges to perform the operation
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerEnable(CUpti_RangeProfiler_Enable_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerDisable
+ */
+typedef struct CUpti_RangeProfiler_Disable_Params
+{
+    /// [in] Size of the data structure.
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_Disable_Params;
+#define CUpti_RangeProfiler_Disable_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Disable_Params, pRangeProfilerObject)
+
+/**
+ * \brief Disable the range profiler on the CUDA context and destroy the range profiler object.
+ * 
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Disable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerDisable(CUpti_RangeProfiler_Disable_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerStart
+ */
+typedef struct CUpti_RangeProfiler_Start_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_Start_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_Start_Params;
+#define CUpti_RangeProfiler_Start_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Start_Params, pRangeProfilerObject)
+
+/**
+ * \brief Start the range profiler.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Start_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler Start is called without enabling the range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerStart(CUpti_RangeProfiler_Start_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerStop
+ */
+typedef struct CUpti_RangeProfiler_Stop_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_Stop_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [out] Pass index for the replay session.
+    size_t passIndex;
+    /// [out] Target nesting level for the replay session.
+    size_t targetNestingLevel;
+    /// [out] 1 if all passes have been submitted to the GPU for collection, 0 otherwise.
+    uint8_t isAllPassSubmitted;
+} CUpti_RangeProfiler_Stop_Params;
+#define CUpti_RangeProfiler_Stop_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_Stop_Params, isAllPassSubmitted)
+
+/**
+ * \brief Stop the range profiler.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_Stop_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler Stop is called without enabling the range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerStop(CUpti_RangeProfiler_Stop_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerPushRange
+ */
+typedef struct CUpti_RangeProfiler_PushRange_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_PushRange_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Name of the range to be profiled (only valid for User range mode).
+    const char* pRangeName;
+} CUpti_RangeProfiler_PushRange_Params;
+#define CUpti_RangeProfiler_PushRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_PushRange_Params, pRangeName)
+
+/**
+ * \brief Add a new range to the Range Profiler with a given range name.
+ * For nested ranges, this API should be called again for the innermost range. To profile nested
+ * ranges, users need to set the values for minNestingLevel and numNestingLevels in the SetConfig API.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_PushRange_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler PushRange is called without enabling the range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerPushRange(CUpti_RangeProfiler_PushRange_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerPopRange
+ */
+typedef struct CUpti_RangeProfiler_PopRange_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_PopRange_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+} CUpti_RangeProfiler_PopRange_Params;
+#define CUpti_RangeProfiler_PopRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_PopRange_Params, pRangeProfilerObject)
+
+/**
+ * \brief Pop the current range from the Range Profiler.
+ * The number of pop range API calls should be the same as the number of push range calls, in the same order.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_PopRange_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler PopRange is called without enabling the range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerPopRange(CUpti_RangeProfiler_PopRange_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerDecodeData
+ */
+typedef struct CUpti_RangeProfiler_DecodeData_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_DecodeData_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [out] Number of ranges dropped in the processed passes.
+    size_t numOfRangeDropped;
+} CUpti_RangeProfiler_DecodeData_Params;
+#define CUpti_RangeProfiler_DecodeData_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_DecodeData_Params, numOfRangeDropped)
+
+/**
+ * \brief Decode the profiling data stored in the hardware to the counter data image passed in the
+ * SetConfig API. This API should be called after cuptiRangeProfilerStop. The counter data image
+ * will be updated with the profiling data for the ranges profiled.
+ *
+ * When the number of ranges that the counter data image can store is less than the number of ranges
+ * profiled (= maxRangesPerPass in SetConfig API), the counter data image will report dropped ranges.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_DecodeData_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler DecodeData is called without enabling the range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerDecodeData(CUpti_RangeProfiler_DecodeData_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerGetCounterDataSize
+ */
+typedef struct CUpti_RangeProfiler_GetCounterDataSize_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_GetCounterDataSize_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Names of the metrics to be collected.
+    const char** pMetricNames;
+    /// [in] Number of metrics to be collected.
+    size_t numMetrics;
+    /// [in] Maximum number of ranges to be stored in the counter data image.
+    size_t maxNumOfRanges;
+    /// [in] Maximum number of RangeTree nodes; must be >= maxNumOfRanges.
+    uint32_t maxNumRangeTreeNodes;
+    /// [out] Size of the counter data image.
+    size_t counterDataSize;
+} CUpti_RangeProfiler_GetCounterDataSize_Params;
+#define CUpti_RangeProfiler_GetCounterDataSize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_GetCounterDataSize_Params, counterDataSize)
+
+/**
+ * \brief Get the size of the counter data image required to store the profiling data for the ranges profiled.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_GetCounterDataSize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler GetCounterDataSize is called without enabling the range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerGetCounterDataSize(CUpti_RangeProfiler_GetCounterDataSize_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerCounterDataImageInitialize
+ */
+typedef struct CUpti_RangeProfiler_CounterDataImage_Initialize_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_CounterDataImage_Initialize_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Range Profiler Object.
+    CUpti_RangeProfiler_Object* pRangeProfilerObject;
+    /// [in] Size of the counter data image.
+    size_t counterDataSize;
+    /// [in] Counter data image.
+    uint8_t* pCounterData;
+} CUpti_RangeProfiler_CounterDataImage_Initialize_Params;
+#define CUpti_RangeProfiler_CounterDataImage_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_CounterDataImage_Initialize_Params, pCounterData)
+
+/**
+ * \brief Initialize the counter data image with the profiling data for the ranges profiled.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_CounterDataImage_Initialize_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_INVALID_OPERATION if range profiler CounterDataImageInitialize is called without enabling the range profiler
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerCounterDataImageInitialize(CUpti_RangeProfiler_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerGetCounterDataInfo
+ */
+typedef struct CUpti_RangeProfiler_GetCounterDataInfo_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_GetCounterDataInfo_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [out] Number of ranges in the counter data image.
+    size_t numTotalRanges;
+} CUpti_RangeProfiler_GetCounterDataInfo_Params;
+#define CUpti_RangeProfiler_GetCounterDataInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_GetCounterDataInfo_Params, numTotalRanges)
+
+/**
+ * \brief Get the number of ranges stored in the counter data image.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_GetCounterDataInfo_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerGetCounterDataInfo(CUpti_RangeProfiler_GetCounterDataInfo_Params* pParams);
+
+/**
+ * \brief Params for cuptiRangeProfilerCounterDataGetRangeInfo
+ */
+typedef struct CUpti_RangeProfiler_CounterData_GetRangeInfo_Params
+{
+    /// [in] Size of the data structure (see CUpti_RangeProfiler_CounterData_GetRangeInfo_Params_STRUCT_SIZE).
+    size_t structSize;
+    /// [in] Set to NULL.
+    void* pPriv;
+    /// [in] Counter data image.
+    const uint8_t* pCounterDataImage;
+    /// [in] Size of the counter data image.
+    size_t counterDataImageSize;
+    /// [in] Index of the range.
+    size_t rangeIndex;
+    /// [in] Range delimiter.
+    const char* rangeDelimiter;
+    /// [out] Range name.
+    const char* rangeName;
+} CUpti_RangeProfiler_CounterData_GetRangeInfo_Params;
+#define CUpti_RangeProfiler_CounterData_GetRangeInfo_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_RangeProfiler_CounterData_GetRangeInfo_Params, rangeName)
+
+/**
+ * \brief Get the range name for the given range index.
+ *
+ * \param pParams A pointer to \ref CUpti_RangeProfiler_CounterData_GetRangeInfo_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its members is not valid
+ * \retval CUPTI_ERROR_UNKNOWN for any internal error
+ */
+CUptiResult CUPTIAPI cuptiRangeProfilerCounterDataGetRangeInfo(CUpti_RangeProfiler_CounterData_GetRangeInfo_Params* pParams);
+
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /*_CUPTI_RANGE_PROFILER_H_*/
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_result.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_result.h
new file mode 100644
index 0000000000000000000000000000000000000000..10371ac621b2472086a4d68af4dc9bdc91f8e417
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_result.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright 2010-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_RESULT_H_)
+#define _CUPTI_RESULT_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_RESULT_API CUPTI Result Codes
+ * Error and result codes returned by CUPTI functions.
+ * @{
+ */
+
+/**
+ * \brief CUPTI result codes.
+ *
+ * Error and result codes returned by CUPTI functions.
+ */
+typedef enum {
+    /**
+     * No error.
+     */
+    CUPTI_SUCCESS                                       = 0,
+    /**
+     * One or more of the parameters is invalid.
+     */
+    CUPTI_ERROR_INVALID_PARAMETER                       = 1,
+    /**
+     * The device does not correspond to a valid CUDA device.
+     */
+    CUPTI_ERROR_INVALID_DEVICE                          = 2,
+    /**
+     * The context is NULL or not valid.
+     */
+    CUPTI_ERROR_INVALID_CONTEXT                         = 3,
+    /**
+     * The event domain id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID                 = 4,
+    /**
+     * The event id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_ID                        = 5,
+    /**
+     * The event name is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_NAME                      = 6,
+    /**
+     * The current operation cannot be performed due to dependency on
+     * other factors.
+     */
+    CUPTI_ERROR_INVALID_OPERATION                       = 7,
+    /**
+     * Unable to allocate enough memory to perform the requested
+     * operation.
+     */
+    CUPTI_ERROR_OUT_OF_MEMORY                           = 8,
+    /**
+     * An error occurred on the performance monitoring hardware.
+     */
+    CUPTI_ERROR_HARDWARE                                = 9,
+    /**
+     * The output buffer size is not sufficient to return all
+     * requested data.
+     */
+    CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT           = 10,
+    /**
+     * API is not implemented.
+     */
+    CUPTI_ERROR_API_NOT_IMPLEMENTED                     = 11,
+    /**
+     * The maximum limit is reached.
+     */
+    CUPTI_ERROR_MAX_LIMIT_REACHED                       = 12,
+    /**
+     * The object is not yet ready to perform the requested operation.
+     */
+    CUPTI_ERROR_NOT_READY                               = 13,
+    /**
+     * The current operation is not compatible with the current state
+     * of the object.
+     */
+    CUPTI_ERROR_NOT_COMPATIBLE                          = 14,
+    /**
+     * CUPTI is unable to initialize its connection to the CUDA
+     * driver.
+     */
+    CUPTI_ERROR_NOT_INITIALIZED                         = 15,
+    /**
+     * The metric id is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_ID                        = 16,
+    /**
+     * The metric name is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_NAME                      = 17,
+    /**
+     * The queue is empty.
+     */
+    CUPTI_ERROR_QUEUE_EMPTY                              = 18,
+    /**
+     * Invalid handle (internal?).
+     */
+    CUPTI_ERROR_INVALID_HANDLE                           = 19,
+    /**
+     * Invalid stream.
+     */
+    CUPTI_ERROR_INVALID_STREAM                           = 20,
+    /**
+     * Invalid kind.
+     */
+    CUPTI_ERROR_INVALID_KIND                             = 21,
+    /**
+     * Invalid event value.
+     */
+    CUPTI_ERROR_INVALID_EVENT_VALUE                      = 22,
+    /**
+     * CUPTI is disabled due to conflicts with other enabled profilers.
+     */
+    CUPTI_ERROR_DISABLED                                 = 23,
+    /**
+     * Invalid module.
+     */
+    CUPTI_ERROR_INVALID_MODULE                           = 24,
+    /**
+     * Invalid metric value.
+     */
+    CUPTI_ERROR_INVALID_METRIC_VALUE                     = 25,
+    /**
+     * The performance monitoring hardware is in use by another client.
+     */
+    CUPTI_ERROR_HARDWARE_BUSY                            = 26,
+    /**
+     * The attempted operation is not supported on the current
+     * system or device.
+     */
+    CUPTI_ERROR_NOT_SUPPORTED                            = 27,
+    /**
+     * Unified memory profiling is not supported on the system.
+     * Potential reason could be unsupported OS or architecture.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED               = 28,
+    /**
+     * Unified memory profiling is not supported on the device.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE     = 29,
+    /**
+     * Unified memory profiling is not supported on a multi-GPU
+     * configuration without P2P support between any pair of devices.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES = 30,
+    /**
+     * Unified memory profiling is not supported under the
+     * Multi-Process Service (MPS) environment. CUDA 7.5 removes this
+     * restriction.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_WITH_MPS      = 31,
+    /**
+     * In CUDA 9.0, devices with compute capability 7.0 don't
+     * support CDP tracing.
+     */
+    CUPTI_ERROR_CDP_TRACING_NOT_SUPPORTED                = 32,
+    /**
+     * Profiling on virtualized GPU is not supported.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_NOT_SUPPORTED         = 33,
+    /**
+     * Profiling results might be incorrect for CUDA applications
+     * compiled with nvcc version older than 9.0 for devices with
+     * compute capability 6.0 and 6.1.
+     * Profiling session will continue and CUPTI will notify it using this error code.
+     * User is advised to recompile the application code with nvcc version 9.0 or later.
+     * Ignore this warning if code is already compiled with the recommended nvcc version.
+     */
+    CUPTI_ERROR_CUDA_COMPILER_NOT_COMPATIBLE             = 34,
+    /**
+     * User doesn't have sufficient privileges which are required to
+     * start the profiling session.
+     * One possible reason for this may be that the NVIDIA driver or your system
+     * administrator may have restricted access to the NVIDIA GPU performance counters.
+     * To learn how to resolve this issue and find more information, please visit
+     * https://developer.nvidia.com/CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
+     */
+    CUPTI_ERROR_INSUFFICIENT_PRIVILEGES                  = 35,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not compatible with the
+     * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
+     * in the headers nvperf_host.h and nvperf_target.h.
+     */
+    CUPTI_ERROR_OLD_PROFILER_API_INITIALIZED             = 36,
+    /**
+     * Missing definition of the OpenACC API routine in the linked OpenACC library.
+     *
+     * One possible reason is that OpenACC library is linked statically in the
+     * user application, which might not have the definition of all the OpenACC
+     * API routines needed for the OpenACC profiling, as compiler might ignore
+     * definitions for the functions not used in the application. This issue
+     * can be mitigated by linking the OpenACC library dynamically.
+     */
+    CUPTI_ERROR_OPENACC_UNDEFINED_ROUTINE                = 37,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not supported on devices with
+     * compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+     * These APIs are deprecated in the CUDA 12.8 release and will be removed in a future CUDA release.
+     * These are replaced by the host profiling API in the header cupti_profiler_host.h and
+     * target profiling API in the header cupti_range_profiler.h which are supported on
+     * devices with compute capability 7.0 and higher (i.e. Volta and later GPU
+     * architectures).
+     */
+    CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED            = 38,
+    /**
+     * CUPTI doesn't allow multiple callback subscribers. Only a single subscriber
+     * can be registered at a time.
+     * Same error code is used when application is launched using NVIDIA tools
+     * like nvprof, Visual Profiler, Nsight Systems, Nsight Compute, cuda-gdb and
+     * cuda-memcheck.
+     */
+    CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED       = 39,
+    /**
+     * Profiling on virtualized GPU is not allowed by hypervisor.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_INSUFFICIENT_PRIVILEGES = 40,
+    /**
+     * Profiling and tracing are not allowed when confidential computing mode
+     * is enabled.
+     */
+    CUPTI_ERROR_CONFIDENTIAL_COMPUTING_NOT_SUPPORTED = 41,
+    /**
+     * CUPTI does not support NVIDIA Crypto Mining Processors (CMP).
+     * For more information, please visit https://developer.nvidia.com/ERR_NVCMPGPU
+     */
+    CUPTI_ERROR_CMP_DEVICE_NOT_SUPPORTED = 42,
+    /**
+     * Profiling on Multi-instance GPU (MIG) is not supported.
+     */
+    CUPTI_ERROR_MIG_DEVICE_NOT_SUPPORTED = 43,
+    /**
+     * Profiling on SLI device is not supported.
+     */
+    CUPTI_ERROR_SLI_DEVICE_NOT_SUPPORTED = 44,
+    /**
+     * Profiling on WSL device is not supported.
+     */
+    CUPTI_ERROR_WSL_DEVICE_NOT_SUPPORTED = 45,
+    /**
+     * An unknown internal error has occurred.
+     */
+    CUPTI_ERROR_UNKNOWN                                  = 999,
+    CUPTI_ERROR_FORCE_INT                                = 0x7fffffff /* NOTE(review): presumably a sentinel forcing int-sized storage, not a real result code - confirm */
+} CUptiResult;
+
+/**
+ * \brief Get the descriptive string for a CUptiResult.
+ *
+ * Return the descriptive string for a CUptiResult in \p *str.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param result The result to get the string for
+ * \param str Returns the descriptive string
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p
+ * result is not a valid CUptiResult
+ */
+CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result, const char **str);
+
+/**
+ * \brief Get the descriptive message corresponding to error codes returned
+ * by CUPTI.
+ *
+ * Return the descriptive error message for a CUptiResult in \p *str.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param result The result to get the descriptive error message for
+ * \param str Returns the error message string
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p
+ * result is not a valid CUptiResult
+ *
+ */
+
+CUptiResult CUPTIAPI cuptiGetErrorMessage(CUptiResult result, const char **str);
+
+/** @} */ /* END CUPTI_RESULT_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_RESULT_H_*/
+
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..acb59cf8e5882a5ff13b4a1b0fdc6bc7b0ec47f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_sass_metrics.h
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_SASS_METRICS_H_)
+#define _CUPTI_SASS_METRICS_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_profiler_target.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_SASS_METRICS_API CUPTI SASS Metrics API
+ * Functions, types, and enums that implement the CUPTI SASS Metrics API.
+ * @{
+ */
+
+typedef enum
+{
+    /// SASS metric data will be collected at GPU level. 
+    /// In CUpti_SassMetricsGetDataProperties_Params struct the numOfInstances will be equal to 1
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_GPU = 0,
+
+    /// SASS metric data will be collected at SM level
+    /// In CUpti_SassMetricsGetDataProperties_Params struct the numOfInstances will be equal to number of SMs in the GPU
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SM = 1,
+
+    /// SASS metric data will be collected at SM sub-partition level
+    /// In CUpti_SassMetricsGetDataProperties_Params struct the numOfInstances will be equal to number of SM sub-partitions in the GPU
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SMSP = 2,
+
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_INVALID
+} CUpti_SassMetrics_OutputGranularity;
+
+typedef struct CUpti_SassMetrics_MetricDetails
+{
+    /// unique ID for the SASS metric
+    uint64_t metricId;
+    /// metric name
+    const char* pMetricName;
+    /// metric description
+    const char* pMetricDescription;
+} CUpti_SassMetrics_MetricDetails;
+
+/**
+ * \brief Params for cuptiSassMetricsGetNumOfMetrics
+ */
+typedef struct CUpti_SassMetrics_GetNumOfMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] chip name for which metrics will be queried
+    const char* pChipName;
+    /// [out] number of metrics supported for the queried chip
+    size_t numOfMetrics;
+} CUpti_SassMetrics_GetNumOfMetrics_Params;
+
+#define CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetNumOfMetrics_Params, numOfMetrics)
+
+/**
+ * \brief Get the number of supported SASS metrics for the chip.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetNumOfMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetNumOfMetrics(CUpti_SassMetrics_GetNumOfMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetMetrics
+ */
+typedef struct CUpti_SassMetrics_GetMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] chip name for which metrics will be queried
+    const char* pChipName;
+    /// [in] number of metrics supported for the queried chip (can be queried using cuptiSassMetricsGetNumOfMetrics())
+    size_t numOfMetrics;
+    /// [out] list of metrics supported for queried chip
+    CUpti_SassMetrics_MetricDetails* pMetricsList;
+} CUpti_SassMetrics_GetMetrics_Params;
+#define CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetMetrics_Params, pMetricsList)
+
+/**
+ * \brief Get the list of all supported SASS metrics for the chip.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetMetrics(CUpti_SassMetrics_GetMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetProperties
+ */
+typedef struct CUpti_SassMetrics_GetProperties_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] chip name for which metric will be queried
+    const char* pChipName;
+    /// [in] metric name
+    const char* pMetricName;
+    /// [out] returns the metric ID and the metric description
+    CUpti_SassMetrics_MetricDetails metric;
+} CUpti_SassMetrics_GetProperties_Params;
+#define CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE        CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetProperties_Params, metric)
+
+/**
+ * \brief Get metric properties for the queried metric.
+ * For a given metric the results will be put in CUpti_SassMetrics_MetricDetails, which
+ * stores the metric ID, the metric name and the description of the metric.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetProperties(CUpti_SassMetrics_GetProperties_Params *pParams);
+
+typedef struct CUpti_SassMetrics_Config
+{
+    /// [in] unique id for the SASS metric, can be queried using cuptiSassMetricsGetProperties()
+    uint64_t metricId;
+    /// [in] CUpti_SassMetrics_OutputGranularity
+    uint8_t outputGranularity;
+} CUpti_SassMetrics_Config;
+
+/**
+ * \brief Params for cuptiSassMetricsSetConfig
+ */
+typedef struct CUpti_SassMetricsSetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] number of metric configs; will be equal to the number of metrics queried
+    size_t numOfMetricConfig;
+    /// [in] list of metric configs generated for the given SASS metrics
+    CUpti_SassMetrics_Config* pConfigs;
+    /// [in] device index for which the config will be set; the user can call this once for
+    /// the device on which the SASS metric data will be collected
+    uint32_t deviceIndex;
+} CUpti_SassMetricsSetConfig_Params;
+#define CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsSetConfig_Params, deviceIndex)
+
+/**
+ * \brief Set config for the SASS metric data collection for a device.
+ * The user needs to call this API before calling any of the SASS metric data collection APIs.
+ * Each set config API call needs to be followed by a cuptiSassMetricsUnsetConfig() API call
+ * before calling the cuptiSassMetricsSetConfig() API again for the same device.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetricsSetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling unset config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsSetConfig(CUpti_SassMetricsSetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsUnsetConfig
+ */
+typedef struct CUpti_SassMetricsUnsetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] device index for which the SASS metric data collection config will be reset; the user needs to call this API for
+    /// all the devices on which SASS metric data collection has been configured.
+    uint32_t deviceIndex;
+} CUpti_SassMetricsUnsetConfig_Params;
+#define CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsUnsetConfig_Params, deviceIndex)
+
+/**
+ * \brief Unset config API will reset the SASS metric data collection configuration for the device.
+ * Once this API is called, CUPTI will deallocate all the memory allocated and remove all
+ * the configuration for SASS metric data collection. The user can only call this API for a device on which
+ * the cuptiSassMetricsSetConfig() API has been called earlier.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsUnsetConfig_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling set config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsUnsetConfig(CUpti_SassMetricsUnsetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsEnable
+ */
+typedef struct CUpti_SassMetricsEnable_Params
+{
+    /// [in] equal to CUpti_SassMetricsEnable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] if false, all the functions will be patched regardless of their execution with the cuptiSassMetricsEnable() API call.
+    /// When this parameter is set to true, metric data collection for the function will be done at the very first execution in the enable/disable
+    /// range.
+    uint8_t enableLazyPatching;
+} CUpti_SassMetricsEnable_Params;
+#define CUpti_SassMetricsEnable_Params_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsEnable_Params, enableLazyPatching)
+
+/**
+ * \brief SASS metric data collection enable API will mark the start of a range, between which kernels
+ *  will be profiled for SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsEnable_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a cuda context without calling 
+ * cuptiSassMetricsDisable() API or called before cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsEnable(CUpti_SassMetricsEnable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsDisable
+ */
+typedef struct CUpti_SassMetricsDisable_Params
+{
+    /// [in] equal to CUpti_SassMetricsDisable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be disabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] Number of dropped SASS records; will be equal to numOfPatchedInstructions * numOfInstances.
+    /// The number of dropped records will be zero when data is flushed prior to calling the disable API.
+    size_t numOfDroppedRecords;
+} CUpti_SassMetricsDisable_Params;
+#define CUpti_SassMetricsDisable_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsDisable_Params, numOfDroppedRecords)
+
+/**
+ * \brief SASS metric data collection disable API will mark the end of a range, any kernel launched after this
+ * API call will not be profiled for the SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsDisable_Params
+ * 
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a cuda context without calling 
+ * cuptiSassMetricsEnable() API or called before cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsDisable(CUpti_SassMetricsDisable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetDataProperties
+ */
+typedef struct CUpti_SassMetricsGetDataProperties_Params
+{
+    /// [in] equal to CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] total number of SASS records that have been collected
+    size_t numOfPatchedInstructionRecords;
+    /// [out] number of instances for each metric value per instruction.
+    /// This will depend on the CUpti_SassMetrics_OutputGranularity level set for the metric config.
+    size_t numOfInstances;
+} CUpti_SassMetricsGetDataProperties_Params;
+
+#define CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsGetDataProperties_Params, numOfInstances)
+/**
+ * \brief SASS metric data properties API reports the number of instances of a metric
+ * value and the number of SASS instruction records that have been collected. The number of instances of a metric
+ * varies with the output granularity level the user set via a CUpti_SassMetrics_OutputGranularity value.
+ * The user needs to allocate memory for retrieving the SASS data using the cuptiSassMetricsFlushData() API.
+ * 
+ * \param pParams A pointer to \ref CUpti_SassMetricsGetDataProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetDataProperties(CUpti_SassMetricsGetDataProperties_Params* pParams);
+
+typedef struct CUpti_SassMetrics_InstanceValue
+{
+    /// unique ID of the SASS metric this value belongs to
+    uint64_t metricId;
+    /// metric value
+    uint64_t value;
+} CUpti_SassMetrics_InstanceValue;
+#define CUpti_SassMetrics_InstanceValue_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_InstanceValue, value)
+
+typedef struct CUpti_SassMetrics_Data
+{
+    /// [in] equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE (NOTE(review): this header defines no CUpti_SassMetrics_Data_STRUCT_SIZE; confirm the intended value)
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [out] Unique cubin id
+    uint32_t cubinCrc;
+    /// [out] function's unique symbol index in the module.
+    uint32_t functionIndex;
+    /// [out] The function name
+    const char* functionName;
+    /// [out] pc offset for the function in a module
+    uint32_t pcOffset;
+    /// [out] array of size equal to the number of instances per metric, which contains the metric ID and metric value.
+    CUpti_SassMetrics_InstanceValue* pInstanceValues;
+} CUpti_SassMetrics_Data;
+
+/**
+ * \brief Params for cuptiSassMetricsFlushData
+ */
+typedef struct CUpti_SassMetricsFlushData_Params
+{
+    /// [in] equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] number of patched instruction records to be retrieved; the user can call cuptiSassMetricsGetDataProperties()
+    /// for getting the total number of records available.
+    size_t numOfPatchedInstructionRecords;
+    /// [in] number of patched instruction record instances for a metric; the user can call cuptiSassMetricsGetDataProperties()
+    /// for getting the total number of instances for each record per metric available.
+    size_t numOfInstances;
+    /// [out] user-allocated buffer into which the SASS metric records are flushed (sizing presumably follows the two counts above; confirm)
+    CUpti_SassMetrics_Data* pMetricsData;
+} CUpti_SassMetricsFlushData_Params;
+#define CUpti_SassMetricsFlushData_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsFlushData_Params, numOfInstances)
+
+/**
+ * \brief Flush SASS metrics data from CUPTI internal buffer to the user buffer.
+ * User needs to allocate the buffer for retrieving the data. The number of records collected
+ * can be queried using the API cuptiSassMetricsGetDataProperties().
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsFlushData_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection.
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsFlushData(CUpti_SassMetricsFlushData_Params* pParams);
+
+/** @} */ /* END CUPTI_SASS_METRICS_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // _CUPTI_SASS_METRICS_H_
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_target.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4b625d45c65288fa2ea7dc05819ee4dfc4cbdd3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_target.h
@@ -0,0 +1,43 @@
+#if !defined(_CUPTI_TARGET_H_)
+#define _CUPTI_TARGET_H_
+
+/*
+CUPTI profiler target APIs
+This file contains the CUPTI profiling APIs.
+*/
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#ifndef CUPTI_PROFILER_STRUCT_SIZE
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+typedef struct CUpti_Device_GetChipName_Params
+{
+    size_t structSize;                                      //!< [in]
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t deviceIndex;                                     //!< [in]
+    const char* pChipName;                                  //!< [out]
+} CUpti_Device_GetChipName_Params;
+
+#define CUpti_Device_GetChipName_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Device_GetChipName_Params, pChipName)
+CUptiResult CUPTIAPI cuptiDeviceGetChipName(CUpti_Device_GetChipName_Params *pParams);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_version.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a8808ea022b4116a1177e6f78d34d0f39604344
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_version.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2010-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_VERSION_H_)
+#define _CUPTI_VERSION_H_
+
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_VERSION_API CUPTI Version
+ * Function and macro to determine the CUPTI version.
+ * @{
+ */
+
+/**
+ * \brief The API version for this implementation of CUPTI.
+ *
+ * The API version for this implementation of CUPTI. This define along
+ * with \ref cuptiGetVersion can be used to dynamically detect if the
+ * version of CUPTI compiled against matches the version of the loaded
+ * CUPTI library.
+ *
+ * v1 : CUDAToolsSDK 4.0
+ * v2 : CUDAToolsSDK 4.1
+ * v3 : CUDA Toolkit 5.0
+ * v4 : CUDA Toolkit 5.5
+ * v5 : CUDA Toolkit 6.0
+ * v6 : CUDA Toolkit 6.5
+ * v7 : CUDA Toolkit 6.5(with sm_52 support)
+ * v8 : CUDA Toolkit 7.0
+ * v9 : CUDA Toolkit 8.0
+ * v10 : CUDA Toolkit 9.0
+ * v11 : CUDA Toolkit 9.1
+ * v12 : CUDA Toolkit 10.0, 10.1 and 10.2
+ * v13 : CUDA Toolkit 11.0
+ * v14 : CUDA Toolkit 11.1
+ * v15 : CUDA Toolkit 11.2, 11.3 and 11.4
+ * v16 : CUDA Toolkit 11.5
+ * v17 : CUDA Toolkit 11.6
+ * v18 : CUDA Toolkit 11.8
+ * v19 : CUDA Toolkit 12.0
+ * v20 : CUDA Toolkit 12.2
+ * v21 : CUDA Toolkit 12.3
+ * v22 : CUDA Toolkit 12.4
+ * v23 : CUDA Toolkit 12.5
+ * v24 : CUDA Toolkit 12.6
+ * v26 : CUDA Toolkit 12.8
+ */
+#define CUPTI_API_VERSION 26
+
+/**
+ * \brief Get the CUPTI API version.
+ *
+ * Return the API version in \p *version.
+ *
+ * \param version Returns the version
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p version is NULL
+ * \sa CUPTI_API_VERSION
+ */
+CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version);
+
+/** @} */ /* END CUPTI_VERSION_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_VERSION_H_*/
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fa21ad8c1caef27fe00c315759f9379c247302c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_H__)
+#define __DEVICE_ATOMIC_FUNCTIONS_H__
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if defined(__CUDACC_RTC__) /* pick the declaration specifiers shared by every prototype in this header */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__ /* __CUDACC_RTC__ builds: plain __device__ */
+#elif defined(_NVHPC_CUDA)
+# define __DEVICE_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__ /* _NVHPC_CUDA builds: external builtin declaration */
+#else /* neither __CUDACC_RTC__ nor _NVHPC_CUDA */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ /* default: static inline device function */
+#endif /* __CUDACC_RTC__ / _NVHPC_CUDA */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/* Add !defined(_NVHPC_CUDA) to avoid empty function definition in PGI CUDA
+ * C++ compiler where the macro __CUDA_ARCH__ is not defined. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { } /* no __CUDA_ARCH__: every prototype below becomes an empty-bodied definition */
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA */
+#define __DEF_IF_HOST ; /* otherwise every prototype below stays a pure declaration */
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) __DEF_IF_HOST /* each prototype ends in __DEF_IF_HOST, i.e. either "{ }" or ";" */
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicSub overloads (int, unsigned int). */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicExch overloads (int, unsigned int, float). */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) __DEF_IF_HOST
+/* atomicMin / atomicMax overloads (int, unsigned int). */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicInc / atomicDec are declared for unsigned int only. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicAnd / atomicOr / atomicXor overloads (int, unsigned int). */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicCAS overloads take an extra `compare` argument ahead of `val`. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) __DEF_IF_HOST
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(_WIN32) /* __DEPRECATED__(msg): deprecation marker in the spelling the toolchain accepts */
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated)) /* GCC < 4, or non-clang GCC 4.0-4.4: msg is dropped */
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) /* everything else: attribute carries msg */
+#endif /* __DEPRECATED__ selection */
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 /* __WSB_DEPRECATION_MESSAGE(x): diagnostic text for the deprecated builtin named x */
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
+    " To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
+#elif defined(_NVHPC_CUDA)
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on cc70 and above, and should be replaced with "#x"_sync()."
+#else /* __CUDA_ARCH__ undefined or below 700: softer "deprecated" wording */
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif /* __WSB_DEPRECATION_MESSAGE selection */
+
+extern "C" /* legacy mask-less __any/__all builtins: C linkage, each tagged with its deprecation message */
+{
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) int __any(int cond);
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) int __all(int cond);
+} /* extern "C" */
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST /* unsigned long long overloads */
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) __DEF_IF_HOST
+/* bool-typed any()/all(); they reuse the deprecation text generated for __any/__all. */
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) bool any(bool cond) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) bool all(bool cond) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "device_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#undef EXCLUDE_FROM_RTC
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..153ac712aab4288e4c16dd229460b677e7b61152
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_atomic_functions.hpp
@@ -0,0 +1,254 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
+#define __DEVICE_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__) /* specifiers placed in front of every wrapper definition in this file */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__ /* __CUDACC_RTC__ builds: plain __device__ */
+#else /* !__CUDACC_RTC__ */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ /* otherwise: static inline device function */
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+
+extern "C" /* builtin entry points the wrappers below forward to; prefix i/u/f/ull mirrors the operand type in each signature */
+{
+extern __device__ __device_builtin__ int          __iAtomicAdd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicExch(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicExch(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ float        __fAtomicExch(float *address, float val);
+extern __device__ __device_builtin__ int          __iAtomicMin(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMin(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicMax(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMax(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicInc(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicDec(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicAnd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAnd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicOr(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicOr(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicXor(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicXor(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicCAS(int *address, int compare, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val);
+/* unsigned long long variants; no Sub builtin is declared in either width. */
+
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val);
+} /* extern "C" */
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
+{ /* int overload: hands both arguments to the __iAtomicAdd builtin and returns its result unchanged */
+  return __iAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: hands both arguments to the __uAtomicAdd builtin and returns its result unchanged */
+  return __uAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
+{ /* int overload: subtraction is issued as __iAtomicAdd of the negated operand; the builtin's result is returned unchanged */
+  return __iAtomicAdd(address, (int)(0U - (unsigned int)val)); /* negate in unsigned arithmetic: -(int)val overflows when val == INT_MIN */
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: subtraction is issued as __uAtomicAdd of the modular negation of val */
+  return __uAtomicAdd(address, 0U - val); /* same bits as (unsigned int)-(int)val, without the signed detour that overflows at 0x80000000 */
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
+{ /* int overload: forwards to the __iAtomicExch builtin and returns whatever it returns */
+  return __iAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: forwards to the __uAtomicExch builtin and returns whatever it returns */
+  return __uAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
+{ /* float overload (the only float wrapper in this file): forwards to the __fAtomicExch builtin */
+  return __fAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
+{ /* int overload: forwards to the signed __iAtomicMin builtin and returns its result */
+  return __iAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: forwards to the unsigned __uAtomicMin builtin and returns its result */
+  return __uAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
+{ /* int overload: forwards to the signed __iAtomicMax builtin and returns its result */
+  return __iAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: forwards to the unsigned __uAtomicMax builtin and returns its result */
+  return __uAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
+{ /* unsigned-only: forwards address and val to the __uAtomicInc builtin and returns its result */
+  return __uAtomicInc(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
+{ /* unsigned-only: forwards address and val to the __uAtomicDec builtin and returns its result */
+  return __uAtomicDec(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
+{ /* int overload: forwards to the __iAtomicAnd builtin; nothing is computed in the wrapper itself */
+  return __iAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: forwards to the __uAtomicAnd builtin; nothing is computed in the wrapper itself */
+  return __uAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
+{ /* int overload: forwards to the __iAtomicOr builtin and returns its result */
+  return __iAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: forwards to the __uAtomicOr builtin and returns its result */
+  return __uAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
+{ /* int overload: forwards to the __iAtomicXor builtin and returns its result */
+  return __iAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
+{ /* unsigned int overload: forwards to the __uAtomicXor builtin and returns its result */
+  return __uAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
+{ /* int overload: passes address, compare and val, in that order, to the __iAtomicCAS builtin */
+  return __iAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
+{ /* unsigned int overload: passes address, compare and val, in that order, to the __uAtomicCAS builtin */
+  return __uAtomicCAS(address, compare, val);
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
+{ /* unsigned long long overload: forwards to the __ullAtomicAdd builtin and returns its result */
+  return __ullAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
+{ /* unsigned long long overload: forwards to the __ullAtomicExch builtin and returns its result */
+  return __ullAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
+{ /* unsigned long long overload: passes address, compare and val, in that order, to the __ullAtomicCAS builtin */
+  return __ullAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
+{ /* bool front end for the int-typed __any builtin */
+  return (bool)__any((int)cond); /* widen cond to int for the call, narrow the int result back to bool */
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
+{ /* bool front end for the int-typed __all builtin */
+  return (bool)__all((int)cond); /* widen cond to int for the call, narrow the int result back to bool */
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_double_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_double_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..82b25e59b40aeaf1e475ff3179e49640a44918b8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_double_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* marker absent: emit the warning, then define the marker temporarily */
+#if defined(_MSC_VER)
+#pragma message("device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__ /* records that this file, not the includer, defined the marker */
+#endif
+/* This file contributes no declarations of its own; it only forwards to the crt/ header. */
+#include "crt/device_double_functions.h"
+/* Remove the marker again, but only if it was defined by this wrapper above. */
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_launch_parameters.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_launch_parameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f552db8faab7d21e90e06a1ea2184a5563d3bf2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_launch_parameters.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+#include "vector_types.h"
+
+#if !defined(__STORAGE__)
+/* Default storage specifiers for the variables declared below; __CUDACC_RTC__ builds add __device__. */
+#if defined(__CUDACC_RTC__)
+#define __STORAGE__ \
+        extern const __device__
+#else /* !__CUDACC_RTC__ */
+#define __STORAGE__ \
+        extern const
+#endif /* __CUDACC_RTC__ */
+/* A __STORAGE__ already defined before this point is used as-is. */
+#endif /* __STORAGE__ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+/* Launch-parameter variables; with the default __STORAGE__ these are extern const declarations, not definitions. */
+uint3 __device_builtin__ __STORAGE__ threadIdx;
+uint3 __device_builtin__ __STORAGE__ blockIdx;
+dim3 __device_builtin__ __STORAGE__ blockDim;
+dim3 __device_builtin__ __STORAGE__ gridDim;
+int __device_builtin__ __STORAGE__ warpSize;
+/* __STORAGE__ is not needed past this point, so it is removed from the macro namespace. */
+#undef __STORAGE__
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#if !defined(__cudaGet_threadIdx)
+/* Default accessor: expands to the threadIdx variable unless a definition already exists. */
+#define __cudaGet_threadIdx() \
+        threadIdx
+
+#endif /* __cudaGet_threadIdx */
+
+#if !defined(__cudaGet_blockIdx)
+/* Default accessor: expands to the blockIdx variable unless a definition already exists. */
+#define __cudaGet_blockIdx() \
+        blockIdx
+
+#endif /* __cudaGet_blockIdx */
+
+#if !defined(__cudaGet_blockDim)
+/* Default accessor: expands to the blockDim variable unless a definition already exists. */
+#define __cudaGet_blockDim() \
+        blockDim
+
+#endif /* __cudaGet_blockDim */
+
+#if !defined(__cudaGet_gridDim)
+/* Default accessor: expands to the gridDim variable unless a definition already exists. */
+#define __cudaGet_gridDim() \
+        gridDim
+
+#endif /* __cudaGet_gridDim */
+
+#if !defined(__cudaGet_warpSize)
+/* Default accessor: expands to the warpSize variable unless a definition already exists. */
+#define __cudaGet_warpSize() \
+        warpSize
+
+#endif /* __cudaGet_warpSize */
+
+#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b575a1014c6cdb9bf2f722c2a67e329186079e6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_types.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_TYPES_H__)
+#define __DEVICE_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif /* the second macro records that this header set the first one, so the end of the file can undo it */
+/* crt/host_defines.h is included unless __DOXYGEN_ONLY__ is defined. */
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+enum __device_builtin__ cudaRoundMode /* rounding-mode selector; enumerators take the implicit values 0..3 */
+{
+    cudaRoundNearest, /* 0; by name: round to nearest (semantics are not defined in this header) */
+    cudaRoundZero, /* 1; by name: round toward zero */
+    cudaRoundPosInf, /* 2; by name: round toward positive infinity */
+    cudaRoundMinInf /* 3; by name: round toward negative infinity */
+};
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif /* both macros are removed only when this header was the one that defined them */
+
+#endif /* !__DEVICE_TYPES_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..94767974220594550d496cad4d14c45349b27737
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_functions.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_FUNCTIONS_H__)
+#define __DRIVER_FUNCTIONS_H__
+
+#include "builtin_types.h"
+#include "crt/host_defines.h"
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_MEMORY
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a cudaPitchedPtr based on input parameters
+ *
+ * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
+ * \p p, \p xsz, and \p ysz.
+ *
+ * \param d   - Pointer to allocated memory
+ * \param p   - Pitch of allocated memory in bytes
+ * \param xsz - Logical width of allocation in elements
+ * \param ysz - Logical height of allocation in elements
+ *
+ * \return
+ * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
+ *
+ * \sa make_cudaExtent, make_cudaPos
+ */
+static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
+{
+  /* Populate the descriptor member by member and hand it back by value. */
+  struct cudaPitchedPtr pitched;
+  pitched.ysize = ysz;
+  pitched.xsize = xsz;
+  pitched.pitch = p;
+  pitched.ptr   = d;
+
+  return pitched;
+}
+
+/**
+ * \brief Returns a cudaPos based on input parameters
+ *
+ * Returns a ::cudaPos based on the specified input parameters \p x,
+ * \p y, and \p z.
+ *
+ * \param x - X position
+ * \param y - Y position
+ * \param z - Z position
+ *
+ * \return
+ * ::cudaPos specified by \p x, \p y, and \p z
+ *
+ * \sa make_cudaExtent, make_cudaPitchedPtr
+ */
+static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
+{
+  /* Copy each coordinate into the member of the same name. */
+  struct cudaPos position;
+  position.z = z;
+  position.y = y;
+  position.x = x;
+
+  return position;
+}
+
+/**
+ * \brief Returns a cudaExtent based on input parameters
+ *
+ * Returns a ::cudaExtent based on the specified input parameters \p w,
+ * \p h, and \p d.
+ *
+ * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
+ * \param h - Height in elements
+ * \param d - Depth in elements
+ *
+ * \return
+ * ::cudaExtent specified by \p w, \p h, and \p d
+ *
+ * \sa make_cudaPitchedPtr, make_cudaPos
+ */
+static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
+{
+  /* Store width, height and depth into the result and return it by value. */
+  struct cudaExtent extent;
+  extent.depth  = d;
+  extent.height = h;
+  extent.width  = w;
+
+  return extent;
+}
+
+/** @} */ /* END CUDART_MEMORY */
+
+#endif /* !__DRIVER_FUNCTIONS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/fatbinary_section.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/fatbinary_section.h
new file mode 100644
index 0000000000000000000000000000000000000000..c017f98f9d668003c0e73fa513c095bb6e717800
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/fatbinary_section.h
@@ -0,0 +1,61 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2010-2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#ifndef fatbinary_section_INCLUDED
+#define fatbinary_section_INCLUDED
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * These defines are for the fatbin.c runtime wrapper
+ */
+#define FATBINC_MAGIC   0x466243B1 /* upper three bytes are ASCII 'F','b','C'; the low byte is 0xB1 */
+#define FATBINC_VERSION 1 /* presumably the "version 1" layout noted on filename_or_fatbins; confirm */
+#define FATBINC_LINK_VERSION 2 /* presumably the "version 2" (prelinked fatbins) layout; confirm */
+
+typedef struct { /* wrapper record used by the fatbin.c runtime wrapper (see the FATBINC_* defines) */
+  int magic; /* presumably checked against FATBINC_MAGIC; confirm with the consumer */
+  int version; /* selects how filename_or_fatbins is interpreted (see its comment) */
+  const unsigned long long* data; /* read-only pointer to 64-bit words; presumably the fatbin payload, confirm */
+  void *filename_or_fatbins;  /* version 1: offline filename,
+                               * version 2: array of prelinked fatbins */
+} __fatBinC_Wrapper_t;
+
+/*
+ * The section that contains the fatbin control structure
+ */
+#ifdef STD_OS_Darwin
+/* mach-o section names are limited to 15 chars and want a __ prefix (else strip complains), so use different names */
+#define FATBIN_CONTROL_SECTION_NAME     "__fatbin"
+#define FATBIN_DATA_SECTION_NAME        "__nv_fatbin"
+/* only need segment name for mach-o */
+#define FATBIN_SEGMENT_NAME             "__NV_CUDA"
+#else /* !STD_OS_Darwin */
+#define FATBIN_CONTROL_SECTION_NAME     ".nvFatBinSegment"
+/*
+ * The section that contains the fatbin data itself
+ * (put in separate section so easy to find)
+ */
+#define FATBIN_DATA_SECTION_NAME        ".nv_fatbin"
+#endif /* STD_OS_Darwin */
+/* section for pre-linked relocatable fatbin data; defined outside the #ifdef, so the name is the same on every platform */
+#define FATBIN_PRELINK_DATA_SECTION_NAME "__nv_relfatbin"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* fatbinary_section_INCLUDED */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaGL_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaGL_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a52e194b265d32f61d47bd3081f4958755bff46
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaGL_meta.h
@@ -0,0 +1,116 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// Dependent includes
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#else
+#include <GL/gl.h>
+#endif
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cudaGL.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuGraphicsGLRegisterBuffer_params_st { // ---- graphics-resource registration (cuGraphicsGLRegister*) ----
+    CUgraphicsResource *pCudaResource;
+    GLuint buffer;
+    unsigned int Flags;
+} cuGraphicsGLRegisterBuffer_params;
+
+typedef struct cuGraphicsGLRegisterImage_params_st {
+    CUgraphicsResource *pCudaResource;
+    GLuint image;
+    GLenum target;
+    unsigned int Flags;
+} cuGraphicsGLRegisterImage_params;
+// ---- GL device enumeration and context creation (_v2 names) ----
+typedef struct cuGLGetDevices_v2_params_st {
+    unsigned int *pCudaDeviceCount;
+    CUdevice *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    CUGLDeviceList deviceList;
+} cuGLGetDevices_v2_params;
+
+typedef struct cuGLCtxCreate_v2_params_st {
+    CUcontext *pCtx;
+    unsigned int Flags;
+    CUdevice device;
+} cuGLCtxCreate_v2_params;
+// ---- Buffer-object calls (cuGL*BufferObject*) ----
+typedef struct cuGLRegisterBufferObject_params_st {
+    GLuint buffer;
+} cuGLRegisterBufferObject_params;
+
+typedef struct cuGLMapBufferObject_v2_ptds_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+} cuGLMapBufferObject_v2_ptds_params;
+
+typedef struct cuGLUnmapBufferObject_params_st {
+    GLuint buffer;
+} cuGLUnmapBufferObject_params;
+
+typedef struct cuGLUnregisterBufferObject_params_st {
+    GLuint buffer;
+} cuGLUnregisterBufferObject_params;
+
+typedef struct cuGLSetBufferObjectMapFlags_params_st {
+    GLuint buffer;
+    unsigned int Flags;
+} cuGLSetBufferObjectMapFlags_params;
+
+typedef struct cuGLMapBufferObjectAsync_v2_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_v2_ptsz_params;
+
+typedef struct cuGLUnmapBufferObjectAsync_params_st {
+    GLuint buffer;
+    CUstream hStream;
+} cuGLUnmapBufferObjectAsync_params;
+// ---- The next four structs repeat the fields of earlier structs under other names ----
+typedef struct cuGLGetDevices_params_st { // same fields as cuGLGetDevices_v2_params
+    unsigned int *pCudaDeviceCount;
+    CUdevice *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    CUGLDeviceList deviceList;
+} cuGLGetDevices_params;
+
+typedef struct cuGLMapBufferObject_v2_params_st { // same fields as cuGLMapBufferObject_v2_ptds_params
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+} cuGLMapBufferObject_v2_params;
+
+typedef struct cuGLMapBufferObjectAsync_v2_params_st { // same fields as cuGLMapBufferObjectAsync_v2_ptsz_params
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_v2_params;
+
+typedef struct cuGLCtxCreate_params_st { // same fields as cuGLCtxCreate_v2_params
+    CUcontext *pCtx;
+    unsigned int Flags;
+    CUdevice device;
+} cuGLCtxCreate_params;
+// ---- Variants typed with CUdeviceptr_v1 and unsigned int sizes (the _v2 structs use CUdeviceptr and size_t) ----
+typedef struct cuGLMapBufferObject_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *size;
+    GLuint buffer;
+} cuGLMapBufferObject_params;
+
+typedef struct cuGLMapBufferObjectAsync_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_params;
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..954db0ad73e2eb029918f595ddee452aa9afd0e3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_meta.h
@@ -0,0 +1,3718 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// No dependent includes
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cuda.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuGetErrorString_params_st { // ---- error lookup, init and driver version ----
+    CUresult error;
+    const char **pStr;
+} cuGetErrorString_params;
+
+typedef struct cuGetErrorName_params_st {
+    CUresult error;
+    const char **pStr;
+} cuGetErrorName_params;
+
+typedef struct cuInit_params_st {
+    unsigned int Flags;
+} cuInit_params;
+
+typedef struct cuDriverGetVersion_params_st {
+    int *driverVersion;
+} cuDriverGetVersion_params;
+// ---- Device lookup and queries (cuDevice*) ----
+typedef struct cuDeviceGet_params_st {
+    CUdevice *device;
+    int ordinal;
+} cuDeviceGet_params;
+
+typedef struct cuDeviceGetCount_params_st {
+    int *count;
+} cuDeviceGetCount_params;
+
+typedef struct cuDeviceGetName_params_st {
+    char *name;
+    int len;
+    CUdevice dev;
+} cuDeviceGetName_params;
+
+typedef struct cuDeviceGetUuid_params_st {
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_params;
+
+typedef struct cuDeviceGetUuid_v2_params_st { // same fields as cuDeviceGetUuid_params
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_v2_params;
+
+typedef struct cuDeviceGetLuid_params_st {
+    char *luid;
+    unsigned int *deviceNodeMask;
+    CUdevice dev;
+} cuDeviceGetLuid_params;
+
+typedef struct cuDeviceTotalMem_v2_params_st {
+    size_t *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_v2_params;
+
+typedef struct cuDeviceGetTexture1DLinearMaxWidth_params_st {
+    size_t *maxWidthInElements;
+    CUarray_format format;
+    unsigned numChannels;
+    CUdevice dev;
+} cuDeviceGetTexture1DLinearMaxWidth_params;
+
+typedef struct cuDeviceGetAttribute_params_st {
+    int *pi;
+    CUdevice_attribute attrib;
+    CUdevice dev;
+} cuDeviceGetAttribute_params;
+
+typedef struct cuDeviceGetNvSciSyncAttributes_params_st {
+    void *nvSciSyncAttrList;
+    CUdevice dev;
+    int flags;
+} cuDeviceGetNvSciSyncAttributes_params;
+
+typedef struct cuDeviceSetMemPool_params_st {
+    CUdevice dev;
+    CUmemoryPool pool;
+} cuDeviceSetMemPool_params;
+
+typedef struct cuDeviceGetMemPool_params_st {
+    CUmemoryPool *pool;
+    CUdevice dev;
+} cuDeviceGetMemPool_params;
+
+typedef struct cuDeviceGetDefaultMemPool_params_st {
+    CUmemoryPool *pool_out;
+    CUdevice dev;
+} cuDeviceGetDefaultMemPool_params;
+
+typedef struct cuDeviceGetExecAffinitySupport_params_st {
+    int *pi;
+    CUexecAffinityType type;
+    CUdevice dev;
+} cuDeviceGetExecAffinitySupport_params;
+
+typedef struct cuFlushGPUDirectRDMAWrites_params_st { // the one struct in this run that has no CUdevice field
+    CUflushGPUDirectRDMAWritesTarget target;
+    CUflushGPUDirectRDMAWritesScope scope;
+} cuFlushGPUDirectRDMAWrites_params;
+
+typedef struct cuDeviceGetProperties_params_st {
+    CUdevprop *prop;
+    CUdevice dev;
+} cuDeviceGetProperties_params;
+
+typedef struct cuDeviceComputeCapability_params_st {
+    int *major;
+    int *minor;
+    CUdevice dev;
+} cuDeviceComputeCapability_params;
+// ---- Primary-context calls (cuDevicePrimaryCtx*), all keyed by a CUdevice ----
+typedef struct cuDevicePrimaryCtxRetain_params_st {
+    CUcontext *pctx;
+    CUdevice dev;
+} cuDevicePrimaryCtxRetain_params;
+
+typedef struct cuDevicePrimaryCtxRelease_v2_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_v2_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_v2_params_st {
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_v2_params;
+
+typedef struct cuDevicePrimaryCtxGetState_params_st {
+    CUdevice dev;
+    unsigned int *flags;
+    int *active;
+} cuDevicePrimaryCtxGetState_params;
+
+typedef struct cuDevicePrimaryCtxReset_v2_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_v2_params;
+
+typedef struct cuCtxCreate_v2_params_st { // ---- context creation; v2/v3/v4 differ only in the extra arguments ----
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v2_params;
+
+typedef struct cuCtxCreate_v3_params_st { // v2 fields plus a CUexecAffinityParam array and its count
+    CUcontext *pctx;
+    CUexecAffinityParam *paramsArray;
+    int numParams;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v3_params;
+
+typedef struct cuCtxCreate_v4_params_st { // v2 fields plus a CUctxCreateParams pointer
+    CUcontext *pctx;
+    CUctxCreateParams *ctxCreateParams;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v4_params;
+// ---- Context destroy and push/pop/set/get of the current context ----
+typedef struct cuCtxDestroy_v2_params_st {
+    CUcontext ctx;
+} cuCtxDestroy_v2_params;
+
+typedef struct cuCtxPushCurrent_v2_params_st {
+    CUcontext ctx;
+} cuCtxPushCurrent_v2_params;
+
+typedef struct cuCtxPopCurrent_v2_params_st {
+    CUcontext *pctx;
+} cuCtxPopCurrent_v2_params;
+
+typedef struct cuCtxSetCurrent_params_st {
+    CUcontext ctx;
+} cuCtxSetCurrent_params;
+
+typedef struct cuCtxGetCurrent_params_st {
+    CUcontext *pctx;
+} cuCtxGetCurrent_params;
+// ---- Context queries and settings ----
+typedef struct cuCtxGetDevice_params_st {
+    CUdevice *device;
+} cuCtxGetDevice_params;
+
+typedef struct cuCtxGetFlags_params_st {
+    unsigned int *flags;
+} cuCtxGetFlags_params;
+
+typedef struct cuCtxSetFlags_params_st {
+    unsigned int flags;
+} cuCtxSetFlags_params;
+
+typedef struct cuCtxGetId_params_st {
+    CUcontext ctx;
+    unsigned long long *ctxId;
+} cuCtxGetId_params;
+
+typedef struct cuCtxSetLimit_params_st {
+    CUlimit limit;
+    size_t value;
+} cuCtxSetLimit_params;
+
+typedef struct cuCtxGetLimit_params_st {
+    size_t *pvalue;
+    CUlimit limit;
+} cuCtxGetLimit_params;
+
+typedef struct cuCtxGetCacheConfig_params_st {
+    CUfunc_cache *pconfig;
+} cuCtxGetCacheConfig_params;
+
+typedef struct cuCtxSetCacheConfig_params_st {
+    CUfunc_cache config;
+} cuCtxSetCacheConfig_params;
+
+typedef struct cuCtxGetApiVersion_params_st {
+    CUcontext ctx;
+    unsigned int *version;
+} cuCtxGetApiVersion_params;
+
+typedef struct cuCtxGetStreamPriorityRange_params_st {
+    int *leastPriority;
+    int *greatestPriority;
+} cuCtxGetStreamPriorityRange_params;
+
+typedef struct cuCtxGetExecAffinity_params_st {
+    CUexecAffinityParam *pExecAffinity;
+    CUexecAffinityType type;
+} cuCtxGetExecAffinity_params;
+// ---- Context event record/wait: both take a context handle and an event handle ----
+typedef struct cuCtxRecordEvent_params_st {
+    CUcontext hCtx;
+    CUevent hEvent;
+} cuCtxRecordEvent_params;
+
+typedef struct cuCtxWaitEvent_params_st {
+    CUcontext hCtx;
+    CUevent hEvent;
+} cuCtxWaitEvent_params;
+// ---- cuCtxAttach/cuCtxDetach and shared-memory configuration ----
+typedef struct cuCtxAttach_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+} cuCtxAttach_params;
+
+typedef struct cuCtxDetach_params_st {
+    CUcontext ctx;
+} cuCtxDetach_params;
+
+typedef struct cuCtxGetSharedMemConfig_params_st {
+    CUsharedconfig *pConfig;
+} cuCtxGetSharedMemConfig_params;
+
+typedef struct cuCtxSetSharedMemConfig_params_st {
+    CUsharedconfig config;
+} cuCtxSetSharedMemConfig_params;
+
+typedef struct cuModuleLoad_params_st { // ---- module loading and symbol lookup (cuModule*) ----
+    CUmodule *module;
+    const char *fname;
+} cuModuleLoad_params;
+
+typedef struct cuModuleLoadData_params_st {
+    CUmodule *module;
+    const void *image;
+} cuModuleLoadData_params;
+
+typedef struct cuModuleLoadDataEx_params_st { // cuModuleLoadData fields plus an option count and two option arrays
+    CUmodule *module;
+    const void *image;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuModuleLoadDataEx_params;
+
+typedef struct cuModuleLoadFatBinary_params_st {
+    CUmodule *module;
+    const void *fatCubin;
+} cuModuleLoadFatBinary_params;
+
+typedef struct cuModuleUnload_params_st {
+    CUmodule hmod;
+} cuModuleUnload_params;
+
+typedef struct cuModuleGetLoadingMode_params_st {
+    CUmoduleLoadingMode *mode;
+} cuModuleGetLoadingMode_params;
+
+typedef struct cuModuleGetFunction_params_st {
+    CUfunction *hfunc;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetFunction_params;
+
+typedef struct cuModuleGetFunctionCount_params_st {
+    unsigned int *count;
+    CUmodule mod;
+} cuModuleGetFunctionCount_params;
+
+typedef struct cuModuleEnumerateFunctions_params_st {
+    CUfunction *functions;
+    unsigned int numFunctions;
+    CUmodule mod;
+} cuModuleEnumerateFunctions_params;
+
+typedef struct cuModuleGetGlobal_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_v2_params;
+// ---- Linker state (cuLink*); create/add calls share the numOptions/options/optionValues triple ----
+typedef struct cuLinkCreate_v2_params_st {
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_v2_params;
+
+typedef struct cuLinkAddData_v2_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_v2_params;
+
+typedef struct cuLinkAddFile_v2_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_v2_params;
+
+typedef struct cuLinkComplete_params_st {
+    CUlinkState state;
+    void **cubinOut;
+    size_t *sizeOut;
+} cuLinkComplete_params;
+
+typedef struct cuLinkDestroy_params_st {
+    CUlinkState state;
+} cuLinkDestroy_params;
+// ---- Texture/surface reference lookup by name within a module ----
+typedef struct cuModuleGetTexRef_params_st {
+    CUtexref *pTexRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetTexRef_params;
+
+typedef struct cuModuleGetSurfRef_params_st {
+    CUsurfref *pSurfRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetSurfRef_params;
+
+typedef struct cuLibraryLoadData_params_st { // ---- library loading and enumeration (cuLibrary*) ----
+    CUlibrary *library;
+    const void *code;
+    CUjit_option *jitOptions;
+    void **jitOptionsValues;
+    unsigned int numJitOptions;
+    CUlibraryOption *libraryOptions;
+    void **libraryOptionValues;
+    unsigned int numLibraryOptions;
+} cuLibraryLoadData_params;
+
+typedef struct cuLibraryLoadFromFile_params_st { // as cuLibraryLoadData_params, with a file name in place of the code pointer
+    CUlibrary *library;
+    const char *fileName;
+    CUjit_option *jitOptions;
+    void **jitOptionsValues;
+    unsigned int numJitOptions;
+    CUlibraryOption *libraryOptions;
+    void **libraryOptionValues;
+    unsigned int numLibraryOptions;
+} cuLibraryLoadFromFile_params;
+
+typedef struct cuLibraryUnload_params_st {
+    CUlibrary library;
+} cuLibraryUnload_params;
+
+typedef struct cuLibraryGetKernel_params_st {
+    CUkernel *pKernel;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetKernel_params;
+
+typedef struct cuLibraryGetKernelCount_params_st {
+    unsigned int *count;
+    CUlibrary lib;
+} cuLibraryGetKernelCount_params;
+
+typedef struct cuLibraryEnumerateKernels_params_st {
+    CUkernel *kernels;
+    unsigned int numKernels;
+    CUlibrary lib;
+} cuLibraryEnumerateKernels_params;
+
+typedef struct cuLibraryGetModule_params_st {
+    CUmodule *pMod;
+    CUlibrary library;
+} cuLibraryGetModule_params;
+// ---- Lookups that start from a CUkernel handle ----
+typedef struct cuKernelGetFunction_params_st {
+    CUfunction *pFunc;
+    CUkernel kernel;
+} cuKernelGetFunction_params;
+
+typedef struct cuKernelGetLibrary_params_st {
+    CUlibrary *pLib;
+    CUkernel kernel;
+} cuKernelGetLibrary_params;
+// ---- Library symbol lookups by name ----
+typedef struct cuLibraryGetGlobal_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetGlobal_params;
+
+typedef struct cuLibraryGetManaged_params_st { // same fields as cuLibraryGetGlobal_params
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetManaged_params;
+
+typedef struct cuLibraryGetUnifiedFunction_params_st {
+    void **fptr;
+    CUlibrary library;
+    const char *symbol;
+} cuLibraryGetUnifiedFunction_params;
+// ---- Kernel attributes, cache config, name and parameter info ----
+typedef struct cuKernelGetAttribute_params_st {
+    int *pi;
+    CUfunction_attribute attrib;
+    CUkernel kernel;
+    CUdevice dev;
+} cuKernelGetAttribute_params;
+
+typedef struct cuKernelSetAttribute_params_st {
+    CUfunction_attribute attrib;
+    int val;
+    CUkernel kernel;
+    CUdevice dev;
+} cuKernelSetAttribute_params;
+
+typedef struct cuKernelSetCacheConfig_params_st {
+    CUkernel kernel;
+    CUfunc_cache config;
+    CUdevice dev;
+} cuKernelSetCacheConfig_params;
+
+typedef struct cuKernelGetName_params_st {
+    const char **name;
+    CUkernel hfunc;
+} cuKernelGetName_params;
+
+typedef struct cuKernelGetParamInfo_params_st {
+    CUkernel kernel;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cuKernelGetParamInfo_params;
+
+typedef struct cuMemGetInfo_v2_params_st { // ---- memory info, allocation and release (cuMem*) ----
+    size_t *free;
+    size_t *total;
+} cuMemGetInfo_v2_params;
+
+typedef struct cuMemAlloc_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+} cuMemAlloc_v2_params;
+
+typedef struct cuMemAllocPitch_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *pPitch;
+    size_t WidthInBytes;
+    size_t Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_v2_params;
+
+typedef struct cuMemFree_v2_params_st {
+    CUdeviceptr dptr;
+} cuMemFree_v2_params;
+
+typedef struct cuMemGetAddressRange_v2_params_st {
+    CUdeviceptr *pbase;
+    size_t *psize;
+    CUdeviceptr dptr;
+} cuMemGetAddressRange_v2_params;
+
+typedef struct cuMemAllocHost_v2_params_st {
+    void **pp;
+    size_t bytesize;
+} cuMemAllocHost_v2_params;
+
+typedef struct cuMemFreeHost_params_st {
+    void *p;
+} cuMemFreeHost_params;
+
+typedef struct cuMemHostAlloc_params_st {
+    void **pp;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostAlloc_params;
+
+typedef struct cuMemHostGetDevicePointer_v2_params_st {
+    CUdeviceptr *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_v2_params;
+
+typedef struct cuMemHostGetFlags_params_st {
+    unsigned int *pFlags;
+    void *p;
+} cuMemHostGetFlags_params;
+
+typedef struct cuMemAllocManaged_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    unsigned int flags;
+} cuMemAllocManaged_params;
+// ---- Async notification callback registration ----
+typedef struct cuDeviceRegisterAsyncNotification_params_st {
+    CUdevice device;
+    CUasyncCallback callbackFunc;
+    void *userData;
+    CUasyncCallbackHandle *callback;
+} cuDeviceRegisterAsyncNotification_params;
+
+typedef struct cuDeviceUnregisterAsyncNotification_params_st {
+    CUdevice device;
+    CUasyncCallbackHandle callback;
+} cuDeviceUnregisterAsyncNotification_params;
+// ---- PCI bus id lookups ----
+typedef struct cuDeviceGetByPCIBusId_params_st {
+    CUdevice *dev;
+    const char *pciBusId;
+} cuDeviceGetByPCIBusId_params;
+
+typedef struct cuDeviceGetPCIBusId_params_st {
+    char *pciBusId;
+    int len;
+    CUdevice dev;
+} cuDeviceGetPCIBusId_params;
+// ---- IPC event and memory handles (cuIpc*) ----
+typedef struct cuIpcGetEventHandle_params_st {
+    CUipcEventHandle *pHandle;
+    CUevent event;
+} cuIpcGetEventHandle_params;
+
+typedef struct cuIpcOpenEventHandle_params_st {
+    CUevent *phEvent;
+    CUipcEventHandle handle;
+} cuIpcOpenEventHandle_params;
+
+typedef struct cuIpcGetMemHandle_params_st {
+    CUipcMemHandle *pHandle;
+    CUdeviceptr dptr;
+} cuIpcGetMemHandle_params;
+
+typedef struct cuIpcOpenMemHandle_v2_params_st {
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;
+    unsigned int Flags;
+} cuIpcOpenMemHandle_v2_params;
+
+typedef struct cuIpcCloseMemHandle_params_st {
+    CUdeviceptr dptr;
+} cuIpcCloseMemHandle_params;
+// ---- Host memory registration ----
+typedef struct cuMemHostRegister_v2_params_st {
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_v2_params;
+
+typedef struct cuMemHostUnregister_params_st {
+    void *p;
+} cuMemHostUnregister_params;
+
+typedef struct cuMemcpy_ptds_params_st { // ---- copies under _ptds names; none of these structs has a stream field ----
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_ptds_params;
+
+typedef struct cuMemcpyPeer_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_ptds_params;
+
+typedef struct cuMemcpyHtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoH_v2_ptds_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_ptds_params;
+
+typedef struct cuMemcpyDtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_ptds_params;
+
+typedef struct cuMemcpyHtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoH_v2_ptds_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_ptds_params;
+
+typedef struct cuMemcpyAtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_ptds_params;
+
+typedef struct cuMemcpy2D_v2_ptds_params_st { // this and the next three carry a single pointer to a const descriptor
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_ptds_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_ptds_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_ptds_params;
+
+typedef struct cuMemcpy3D_v2_ptds_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_ptds_params;
+
+typedef struct cuMemcpy3DPeer_ptds_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_ptds_params;
+// ---- Async copies under _ptsz names; each adds a trailing CUstream hStream to the matching _ptds layout ----
+typedef struct cuMemcpyAsync_ptsz_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_ptsz_params;
+
+typedef struct cuMemcpyPeerAsync_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_ptsz_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_ptsz_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_ptsz_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_ptsz_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy2DAsync_v2_ptsz_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DAsync_v2_ptsz_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DPeerAsync_ptsz_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_ptsz_params;
+
+typedef struct cuMemcpyBatchAsync_ptsz_params_st {
+    CUdeviceptr *dsts;
+    CUdeviceptr *srcs;
+    size_t *sizes;
+    size_t count;
+    CUmemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    CUstream hStream;
+} cuMemcpyBatchAsync_ptsz_params;
+
+typedef struct cuMemcpy3DBatchAsync_ptsz_params_st {
+    size_t numOps;
+    CUDA_MEMCPY3D_BATCH_OP *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    CUstream hStream;
+} cuMemcpy3DBatchAsync_ptsz_params;
+
+typedef struct cuMemsetD8_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_ptds_params;
+
+typedef struct cuMemsetD16_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_ptds_params;
+
+typedef struct cuMemsetD32_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_ptds_params;
+
+typedef struct cuMemsetD2D8_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_ptds_params;
+
+typedef struct cuMemsetD2D16_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_ptds_params;
+
+typedef struct cuMemsetD2D32_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_ptds_params;
+
+typedef struct cuMemsetD8Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_ptsz_params;
+
+typedef struct cuMemsetD16Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_ptsz_params;
+
+typedef struct cuMemsetD32Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_ptsz_params;
+
+typedef struct cuMemsetD2D8Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_ptsz_params;
+
+typedef struct cuMemsetD2D16Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_ptsz_params;
+
+typedef struct cuMemsetD2D32Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_ptsz_params;
+
+typedef struct cuArrayCreate_v2_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR *pAllocateArray;
+} cuArrayCreate_v2_params;
+
+typedef struct cuArrayGetDescriptor_v2_params_st {
+    CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_v2_params;
+
+typedef struct cuArrayGetSparseProperties_params_st {
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUarray array;
+} cuArrayGetSparseProperties_params;
+
+typedef struct cuMipmappedArrayGetSparseProperties_params_st {
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUmipmappedArray mipmap;
+} cuMipmappedArrayGetSparseProperties_params;
+
+typedef struct cuArrayGetMemoryRequirements_params_st {
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUarray array;
+    CUdevice device;
+} cuArrayGetMemoryRequirements_params;
+
+typedef struct cuMipmappedArrayGetMemoryRequirements_params_st {
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUmipmappedArray mipmap;
+    CUdevice device;
+} cuMipmappedArrayGetMemoryRequirements_params;
+
+typedef struct cuArrayGetPlane_params_st {
+    CUarray *pPlaneArray;
+    CUarray hArray;
+    unsigned int planeIdx;
+} cuArrayGetPlane_params;
+
+typedef struct cuArrayDestroy_params_st {
+    CUarray hArray;
+} cuArrayDestroy_params;
+
+typedef struct cuArray3DCreate_v2_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray;
+} cuArray3DCreate_v2_params;
+
+typedef struct cuArray3DGetDescriptor_v2_params_st {
+    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_v2_params;
+
+typedef struct cuMipmappedArrayCreate_params_st {
+    CUmipmappedArray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc;
+    unsigned int numMipmapLevels;
+} cuMipmappedArrayCreate_params;
+
+typedef struct cuMipmappedArrayGetLevel_params_st {
+    CUarray *pLevelArray;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int level;
+} cuMipmappedArrayGetLevel_params;
+
+typedef struct cuMipmappedArrayDestroy_params_st {
+    CUmipmappedArray hMipmappedArray;
+} cuMipmappedArrayDestroy_params;
+
+typedef struct cuMemGetHandleForAddressRange_params_st {
+    void *handle;
+    CUdeviceptr dptr;
+    size_t size;
+    CUmemRangeHandleType handleType;
+    unsigned long long flags;
+} cuMemGetHandleForAddressRange_params;
+
+typedef struct cuMemBatchDecompressAsync_ptsz_params_st {
+    CUmemDecompressParams *paramsArray;
+    size_t count;
+    unsigned int flags;
+    size_t *errorIndex;
+    CUstream stream;
+} cuMemBatchDecompressAsync_ptsz_params;
+
+typedef struct cuMemAddressReserve_params_st {
+    CUdeviceptr *ptr;
+    size_t size;
+    size_t alignment;
+    CUdeviceptr addr;
+    unsigned long long flags;
+} cuMemAddressReserve_params;
+
+typedef struct cuMemAddressFree_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemAddressFree_params;
+
+typedef struct cuMemCreate_params_st {
+    CUmemGenericAllocationHandle *handle;
+    size_t size;
+    const CUmemAllocationProp *prop;
+    unsigned long long flags;
+} cuMemCreate_params;
+
+typedef struct cuMemRelease_params_st {
+    CUmemGenericAllocationHandle handle;
+} cuMemRelease_params;
+
+typedef struct cuMemMap_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+    size_t offset;
+    CUmemGenericAllocationHandle handle;
+    unsigned long long flags;
+} cuMemMap_params;
+
+typedef struct cuMemMapArrayAsync_ptsz_params_st {
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count;
+    CUstream hStream;
+} cuMemMapArrayAsync_ptsz_params;
+
+typedef struct cuMemUnmap_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemUnmap_params;
+
+typedef struct cuMemSetAccess_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+    const CUmemAccessDesc *desc;
+    size_t count;
+} cuMemSetAccess_params;
+
+typedef struct cuMemGetAccess_params_st {
+    unsigned long long *flags;
+    const CUmemLocation *location;
+    CUdeviceptr ptr;
+} cuMemGetAccess_params;
+
+typedef struct cuMemExportToShareableHandle_params_st {
+    void *shareableHandle;
+    CUmemGenericAllocationHandle handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemExportToShareableHandle_params;
+
+typedef struct cuMemImportFromShareableHandle_params_st {
+    CUmemGenericAllocationHandle *handle;
+    void *osHandle;
+    CUmemAllocationHandleType shHandleType;
+} cuMemImportFromShareableHandle_params;
+
+typedef struct cuMemGetAllocationGranularity_params_st {
+    size_t *granularity;
+    const CUmemAllocationProp *prop;
+    CUmemAllocationGranularity_flags option;
+} cuMemGetAllocationGranularity_params;
+
+typedef struct cuMemGetAllocationPropertiesFromHandle_params_st {
+    CUmemAllocationProp *prop;
+    CUmemGenericAllocationHandle handle;
+} cuMemGetAllocationPropertiesFromHandle_params;
+
+typedef struct cuMemRetainAllocationHandle_params_st {
+    CUmemGenericAllocationHandle *handle;
+    void *addr;
+} cuMemRetainAllocationHandle_params;
+
+typedef struct cuMemFreeAsync_ptsz_params_st {
+    CUdeviceptr dptr;
+    CUstream hStream;
+} cuMemFreeAsync_ptsz_params;
+
+typedef struct cuMemAllocAsync_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_ptsz_params;
+
+typedef struct cuMemPoolTrimTo_params_st {
+    CUmemoryPool pool;
+    size_t minBytesToKeep;
+} cuMemPoolTrimTo_params;
+
+typedef struct cuMemPoolSetAttribute_params_st {
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value;
+} cuMemPoolSetAttribute_params;
+
+typedef struct cuMemPoolGetAttribute_params_st {
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value;
+} cuMemPoolGetAttribute_params;
+
+typedef struct cuMemPoolSetAccess_params_st {
+    CUmemoryPool pool;
+    const CUmemAccessDesc *map;
+    size_t count;
+} cuMemPoolSetAccess_params;
+
+typedef struct cuMemPoolGetAccess_params_st {
+    CUmemAccess_flags *flags;
+    CUmemoryPool memPool;
+    CUmemLocation *location;
+} cuMemPoolGetAccess_params;
+
+typedef struct cuMemPoolCreate_params_st {
+    CUmemoryPool *pool;
+    const CUmemPoolProps *poolProps;
+} cuMemPoolCreate_params;
+
+typedef struct cuMemPoolDestroy_params_st {
+    CUmemoryPool pool;
+} cuMemPoolDestroy_params;
+
+typedef struct cuMemAllocFromPoolAsync_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUmemoryPool pool;
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_ptsz_params;
+
+typedef struct cuMemPoolExportToShareableHandle_params_st {
+    void *handle_out;
+    CUmemoryPool pool;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolExportToShareableHandle_params;
+
+typedef struct cuMemPoolImportFromShareableHandle_params_st {
+    CUmemoryPool *pool_out;
+    void *handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolImportFromShareableHandle_params;
+
+typedef struct cuMemPoolExportPointer_params_st {
+    CUmemPoolPtrExportData *shareData_out;
+    CUdeviceptr ptr;
+} cuMemPoolExportPointer_params;
+
+typedef struct cuMemPoolImportPointer_params_st {
+    CUdeviceptr *ptr_out;
+    CUmemoryPool pool;
+    CUmemPoolPtrExportData *shareData;
+} cuMemPoolImportPointer_params;
+
+typedef struct cuMulticastCreate_params_st {
+    CUmemGenericAllocationHandle *mcHandle;
+    const CUmulticastObjectProp *prop;
+} cuMulticastCreate_params;
+
+typedef struct cuMulticastAddDevice_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    CUdevice dev;
+} cuMulticastAddDevice_params;
+
+typedef struct cuMulticastBindMem_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    size_t mcOffset;
+    CUmemGenericAllocationHandle memHandle;
+    size_t memOffset;
+    size_t size;
+    unsigned long long flags;
+} cuMulticastBindMem_params;
+
+typedef struct cuMulticastBindAddr_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    size_t mcOffset;
+    CUdeviceptr memptr;
+    size_t size;
+    unsigned long long flags;
+} cuMulticastBindAddr_params;
+
+typedef struct cuMulticastUnbind_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    CUdevice dev;
+    size_t mcOffset;
+    size_t size;
+} cuMulticastUnbind_params;
+
+typedef struct cuMulticastGetGranularity_params_st {
+    size_t *granularity;
+    const CUmulticastObjectProp *prop;
+    CUmulticastGranularity_flags option;
+} cuMulticastGetGranularity_params;
+
+typedef struct cuPointerGetAttribute_params_st {
+    void *data;
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerGetAttribute_params;
+
+typedef struct cuMemPrefetchAsync_ptsz_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_ptsz_params;
+
+typedef struct cuMemPrefetchAsync_v2_ptsz_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmemLocation location;
+    unsigned int flags;
+    CUstream hStream;
+} cuMemPrefetchAsync_v2_ptsz_params;
+
+typedef struct cuMemAdvise_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmem_advise advice;
+    CUdevice device;
+} cuMemAdvise_params;
+
+typedef struct cuMemAdvise_v2_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmem_advise advice;
+    CUmemLocation location;
+} cuMemAdvise_v2_params;
+
+typedef struct cuMemRangeGetAttribute_params_st {
+    void *data;
+    size_t dataSize;
+    CUmem_range_attribute attribute;
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttribute_params;
+
+typedef struct cuMemRangeGetAttributes_params_st {
+    void **data;
+    size_t *dataSizes;
+    CUmem_range_attribute *attributes;
+    size_t numAttributes;
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttributes_params;
+
+typedef struct cuPointerSetAttribute_params_st {
+    const void *value;
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerSetAttribute_params;
+
+typedef struct cuPointerGetAttributes_params_st {
+    unsigned int numAttributes;
+    CUpointer_attribute *attributes;
+    void **data;
+    CUdeviceptr ptr;
+} cuPointerGetAttributes_params;
+
+typedef struct cuStreamCreate_params_st {
+    CUstream *phStream;
+    unsigned int Flags;
+} cuStreamCreate_params;
+
+typedef struct cuStreamCreateWithPriority_params_st {
+    CUstream *phStream;
+    unsigned int flags;
+    int priority;
+} cuStreamCreateWithPriority_params;
+
+typedef struct cuStreamGetPriority_ptsz_params_st {
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_ptsz_params;
+
+typedef struct cuStreamGetDevice_ptsz_params_st {
+    CUstream hStream;
+    CUdevice *device;
+} cuStreamGetDevice_ptsz_params;
+
+typedef struct cuStreamGetFlags_ptsz_params_st {
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_ptsz_params;
+
+typedef struct cuStreamGetId_ptsz_params_st {
+    CUstream hStream;
+    unsigned long long *streamId;
+} cuStreamGetId_ptsz_params;
+
+typedef struct cuStreamGetCtx_ptsz_params_st {
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_ptsz_params;
+
+typedef struct cuStreamGetCtx_v2_ptsz_params_st {
+    CUstream hStream;
+    CUcontext *pCtx;
+    CUgreenCtx *pGreenCtx;
+} cuStreamGetCtx_v2_ptsz_params;
+
+typedef struct cuStreamWaitEvent_ptsz_params_st {
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_ptsz_params;
+
+typedef struct cuStreamAddCallback_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_ptsz_params;
+
+typedef struct cuStreamBeginCaptureToGraph_ptsz_params_st {
+    CUstream hStream;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCaptureToGraph_ptsz_params;
+
+typedef struct cuThreadExchangeStreamCaptureMode_params_st {
+    CUstreamCaptureMode *mode;
+} cuThreadExchangeStreamCaptureMode_params;
+
+typedef struct cuStreamEndCapture_ptsz_params_st {
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_ptsz_params;
+
+typedef struct cuStreamIsCapturing_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;
+} cuStreamIsCapturing_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v3_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    const CUgraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v3_ptsz_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_ptsz_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_ptsz_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_v2_ptsz_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_v2_ptsz_params;
+
+typedef struct cuStreamAttachMemAsync_ptsz_params_st {
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_ptsz_params;
+
+typedef struct cuStreamQuery_ptsz_params_st {
+    CUstream hStream;
+} cuStreamQuery_ptsz_params;
+
+typedef struct cuStreamSynchronize_ptsz_params_st {
+    CUstream hStream;
+} cuStreamSynchronize_ptsz_params;
+
+typedef struct cuStreamDestroy_v2_params_st {
+    CUstream hStream;
+} cuStreamDestroy_v2_params;
+
+typedef struct cuStreamCopyAttributes_ptsz_params_st {
+    CUstream dst;
+    CUstream src;
+} cuStreamCopyAttributes_ptsz_params;
+
+typedef struct cuStreamGetAttribute_ptsz_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value_out;
+} cuStreamGetAttribute_ptsz_params;
+
+typedef struct cuStreamSetAttribute_ptsz_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *value;
+} cuStreamSetAttribute_ptsz_params;
+
+typedef struct cuEventCreate_params_st {
+    CUevent *phEvent;
+    unsigned int Flags;
+} cuEventCreate_params;
+
+typedef struct cuEventRecord_ptsz_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_ptsz_params;
+
+typedef struct cuEventRecordWithFlags_ptsz_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_ptsz_params;
+
+typedef struct cuEventQuery_params_st {
+    CUevent hEvent;
+} cuEventQuery_params;
+
+typedef struct cuEventSynchronize_params_st {
+    CUevent hEvent;
+} cuEventSynchronize_params;
+
+typedef struct cuEventDestroy_v2_params_st {
+    CUevent hEvent;
+} cuEventDestroy_v2_params;
+
+typedef struct cuEventElapsedTime_params_st {
+    float *pMilliseconds;
+    CUevent hStart;
+    CUevent hEnd;
+} cuEventElapsedTime_params;
+
+typedef struct cuEventElapsedTime_v2_params_st {
+    float *pMilliseconds;
+    CUevent hStart;
+    CUevent hEnd;
+} cuEventElapsedTime_v2_params;
+
+typedef struct cuImportExternalMemory_params_st {
+    CUexternalMemory *extMem_out;
+    const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc;
+} cuImportExternalMemory_params;
+
+typedef struct cuExternalMemoryGetMappedBuffer_params_st {
+    CUdeviceptr *devPtr;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc;
+} cuExternalMemoryGetMappedBuffer_params;
+
+typedef struct cuExternalMemoryGetMappedMipmappedArray_params_st {
+    CUmipmappedArray *mipmap;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc;
+} cuExternalMemoryGetMappedMipmappedArray_params;
+
+typedef struct cuDestroyExternalMemory_params_st {
+    CUexternalMemory extMem;
+} cuDestroyExternalMemory_params;
+
+typedef struct cuImportExternalSemaphore_params_st {
+    CUexternalSemaphore *extSem_out;
+    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc;
+} cuImportExternalSemaphore_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_ptsz_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_ptsz_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuDestroyExternalSemaphore_params_st {
+    CUexternalSemaphore extSem;
+} cuDestroyExternalSemaphore_params;
+
+typedef struct cuStreamWaitValue32_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_v2_ptsz_params;
+
+typedef struct cuStreamWaitValue64_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_v2_ptsz_params;
+
+typedef struct cuStreamWriteValue32_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_v2_ptsz_params;
+
+typedef struct cuStreamWriteValue64_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_v2_ptsz_params;
+
+typedef struct cuStreamBatchMemOp_v2_ptsz_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_v2_ptsz_params;
+
+typedef struct cuFuncGetAttribute_params_st {
+    int *pi;
+    CUfunction_attribute attrib;
+    CUfunction hfunc;
+} cuFuncGetAttribute_params;
+
+typedef struct cuFuncSetAttribute_params_st {
+    CUfunction hfunc;
+    CUfunction_attribute attrib;
+    int value;
+} cuFuncSetAttribute_params;
+
+typedef struct cuFuncSetCacheConfig_params_st {
+    CUfunction hfunc;
+    CUfunc_cache config;
+} cuFuncSetCacheConfig_params;
+
+typedef struct cuFuncGetModule_params_st {
+    CUmodule *hmod;
+    CUfunction hfunc;
+} cuFuncGetModule_params;
+
+typedef struct cuFuncGetName_params_st {
+    const char **name;
+    CUfunction hfunc;
+} cuFuncGetName_params;
+
+typedef struct cuFuncGetParamInfo_params_st {
+    CUfunction func;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cuFuncGetParamInfo_params;
+
+typedef struct cuFuncIsLoaded_params_st {
+    CUfunctionLoadingState *state;
+    CUfunction function;
+} cuFuncIsLoaded_params;
+
+typedef struct cuFuncLoad_params_st {
+    CUfunction function;
+} cuFuncLoad_params;
+
+typedef struct cuLaunchKernel_ptsz_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_ptsz_params;
+
+typedef struct cuLaunchKernelEx_ptsz_params_st {
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernel_ptsz_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernelMultiDevice_params_st {
+    CUDA_LAUNCH_PARAMS *launchParamsList;
+    unsigned int numDevices;
+    unsigned int flags;
+} cuLaunchCooperativeKernelMultiDevice_params;
+
+typedef struct cuLaunchHostFunc_ptsz_params_st {
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_ptsz_params;
+
+typedef struct cuFuncSetBlockShape_params_st {
+    CUfunction hfunc;
+    int x;
+    int y;
+    int z;
+} cuFuncSetBlockShape_params;
+
+typedef struct cuFuncSetSharedSize_params_st {
+    CUfunction hfunc;
+    unsigned int bytes;
+} cuFuncSetSharedSize_params;
+
+typedef struct cuParamSetSize_params_st {
+    CUfunction hfunc;
+    unsigned int numbytes;
+} cuParamSetSize_params;
+
+typedef struct cuParamSeti_params_st {
+    CUfunction hfunc;
+    int offset;
+    unsigned int value;
+} cuParamSeti_params;
+
+typedef struct cuParamSetf_params_st {
+    CUfunction hfunc;
+    int offset;
+    float value;
+} cuParamSetf_params;
+
+typedef struct cuParamSetv_params_st {
+    CUfunction hfunc;
+    int offset;
+    void *ptr;
+    unsigned int numbytes;
+} cuParamSetv_params;
+
+typedef struct cuLaunch_params_st {
+    CUfunction f;
+} cuLaunch_params;
+
+typedef struct cuLaunchGrid_params_st {
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+} cuLaunchGrid_params;
+
+typedef struct cuLaunchGridAsync_params_st {
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+    CUstream hStream;
+} cuLaunchGridAsync_params;
+
+typedef struct cuParamSetTexRef_params_st {
+    CUfunction hfunc;
+    int texunit;
+    CUtexref hTexRef;
+} cuParamSetTexRef_params;
+
+typedef struct cuFuncSetSharedMemConfig_params_st {
+    CUfunction hfunc;
+    CUsharedconfig config;
+} cuFuncSetSharedMemConfig_params;
+
+typedef struct cuGraphCreate_params_st {
+    CUgraph *phGraph;
+    unsigned int flags;
+} cuGraphCreate_params;
+
+typedef struct cuGraphAddKernelNode_v2_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphAddKernelNode_v2_params;
+
+typedef struct cuGraphKernelNodeGetParams_v2_params_st {
+    CUgraphNode hNode;
+    CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeGetParams_v2_params;
+
+typedef struct cuGraphKernelNodeSetParams_v2_params_st {
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeSetParams_v2_params;
+
+typedef struct cuGraphAddMemcpyNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphAddMemcpyNode_params;
+
+typedef struct cuGraphMemcpyNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeGetParams_params;
+
+typedef struct cuGraphMemcpyNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeSetParams_params;
+
+typedef struct cuGraphAddMemsetNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphAddMemsetNode_params;
+
+typedef struct cuGraphMemsetNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeGetParams_params;
+
+typedef struct cuGraphMemsetNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeSetParams_params;
+
+typedef struct cuGraphAddHostNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphAddHostNode_params;
+
+typedef struct cuGraphHostNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeGetParams_params;
+
+typedef struct cuGraphHostNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeSetParams_params;
+
+typedef struct cuGraphAddChildGraphNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUgraph childGraph;
+} cuGraphAddChildGraphNode_params;
+
+typedef struct cuGraphChildGraphNodeGetGraph_params_st {
+    CUgraphNode hNode;
+    CUgraph *phGraph;
+} cuGraphChildGraphNodeGetGraph_params;
+
+typedef struct cuGraphAddEmptyNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+} cuGraphAddEmptyNode_params;
+
+typedef struct cuGraphAddEventRecordNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUevent event;
+} cuGraphAddEventRecordNode_params;
+
+typedef struct cuGraphEventRecordNodeGetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventRecordNodeGetEvent_params;
+
+typedef struct cuGraphEventRecordNodeSetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphAddEventWaitNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUevent event;
+} cuGraphAddEventWaitNode_params;
+
+typedef struct cuGraphEventWaitNodeGetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventWaitNodeGetEvent_params;
+
+typedef struct cuGraphEventWaitNodeSetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphAddExternalSemaphoresSignalNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresSignalNode_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresSignalNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphAddExternalSemaphoresWaitNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresWaitNode_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresWaitNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphAddBatchMemOpNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphAddBatchMemOpNode_params;
+
+typedef struct cuGraphBatchMemOpNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out;
+} cuGraphBatchMemOpNodeGetParams_params;
+
+typedef struct cuGraphBatchMemOpNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphBatchMemOpNodeSetParams_params;
+
+typedef struct cuGraphExecBatchMemOpNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphExecBatchMemOpNodeSetParams_params;
+
+typedef struct cuGraphAddMemAllocNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams;
+} cuGraphAddMemAllocNode_params;
+
+typedef struct cuGraphMemAllocNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEM_ALLOC_NODE_PARAMS *params_out;
+} cuGraphMemAllocNodeGetParams_params;
+
+typedef struct cuGraphAddMemFreeNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUdeviceptr dptr;
+} cuGraphAddMemFreeNode_params;
+
+typedef struct cuGraphMemFreeNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUdeviceptr *dptr_out;
+} cuGraphMemFreeNodeGetParams_params;
+
+typedef struct cuDeviceGraphMemTrim_params_st {
+    CUdevice device;
+} cuDeviceGraphMemTrim_params;
+
+typedef struct cuDeviceGetGraphMemAttribute_params_st {
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value;
+} cuDeviceGetGraphMemAttribute_params;
+
+typedef struct cuDeviceSetGraphMemAttribute_params_st {
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value;
+} cuDeviceSetGraphMemAttribute_params;
+
+typedef struct cuGraphClone_params_st {
+    CUgraph *phGraphClone;
+    CUgraph originalGraph;
+} cuGraphClone_params;
+
+typedef struct cuGraphNodeFindInClone_params_st {
+    CUgraphNode *phNode;
+    CUgraphNode hOriginalNode;
+    CUgraph hClonedGraph;
+} cuGraphNodeFindInClone_params;
+
+typedef struct cuGraphNodeGetType_params_st {
+    CUgraphNode hNode;
+    CUgraphNodeType *type;
+} cuGraphNodeGetType_params;
+
+typedef struct cuGraphGetNodes_params_st {
+    CUgraph hGraph;
+    CUgraphNode *nodes;
+    size_t *numNodes;
+} cuGraphGetNodes_params;
+
+typedef struct cuGraphGetRootNodes_params_st {
+    CUgraph hGraph;
+    CUgraphNode *rootNodes;
+    size_t *numRootNodes;
+} cuGraphGetRootNodes_params;
+
+typedef struct cuGraphGetEdges_params_st {
+    CUgraph hGraph;
+    CUgraphNode *from;
+    CUgraphNode *to;
+    size_t *numEdges;
+} cuGraphGetEdges_params;
+
+typedef struct cuGraphGetEdges_v2_params_st {
+    CUgraph hGraph;
+    CUgraphNode *from;
+    CUgraphNode *to;
+    CUgraphEdgeData *edgeData;
+    size_t *numEdges;
+} cuGraphGetEdges_v2_params;
+
+typedef struct cuGraphNodeGetDependencies_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependencies;
+    size_t *numDependencies;
+} cuGraphNodeGetDependencies_params;
+
+typedef struct cuGraphNodeGetDependencies_v2_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependencies;
+    CUgraphEdgeData *edgeData;
+    size_t *numDependencies;
+} cuGraphNodeGetDependencies_v2_params;
+
+typedef struct cuGraphNodeGetDependentNodes_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependentNodes;
+    size_t *numDependentNodes;
+} cuGraphNodeGetDependentNodes_params;
+
+typedef struct cuGraphNodeGetDependentNodes_v2_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependentNodes;
+    CUgraphEdgeData *edgeData;
+    size_t *numDependentNodes;
+} cuGraphNodeGetDependentNodes_v2_params;
+
+typedef struct cuGraphAddDependencies_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies;
+} cuGraphAddDependencies_params;
+
+typedef struct cuGraphAddDependencies_v2_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    const CUgraphEdgeData *edgeData;
+    size_t numDependencies;
+} cuGraphAddDependencies_v2_params;
+
+typedef struct cuGraphRemoveDependencies_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies;
+} cuGraphRemoveDependencies_params;
+
+typedef struct cuGraphRemoveDependencies_v2_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    const CUgraphEdgeData *edgeData;
+    size_t numDependencies;
+} cuGraphRemoveDependencies_v2_params;
+
+typedef struct cuGraphDestroyNode_params_st {
+    CUgraphNode hNode;
+} cuGraphDestroyNode_params;
+
+typedef struct cuGraphInstantiateWithFlags_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    unsigned long long flags;
+} cuGraphInstantiateWithFlags_params;
+
+typedef struct cuGraphInstantiateWithParams_ptsz_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams;
+} cuGraphInstantiateWithParams_ptsz_params;
+
+typedef struct cuGraphExecGetFlags_params_st {
+    CUgraphExec hGraphExec;
+    cuuint64_t *flags;
+} cuGraphExecGetFlags_params;
+
+typedef struct cuGraphExecKernelNodeSetParams_v2_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphExecKernelNodeSetParams_v2_params;
+
+typedef struct cuGraphExecMemcpyNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphExecMemcpyNodeSetParams_params;
+
+typedef struct cuGraphExecMemsetNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphExecMemsetNodeSetParams_params;
+
+typedef struct cuGraphExecHostNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphExecHostNodeSetParams_params;
+
+typedef struct cuGraphExecChildGraphNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUgraph childGraph;
+} cuGraphExecChildGraphNodeSetParams_params;
+
+typedef struct cuGraphExecEventRecordNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphExecEventWaitNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphExecExternalSemaphoresSignalNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphExecExternalSemaphoresWaitNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphNodeSetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int isEnabled;
+} cuGraphNodeSetEnabled_params;
+
+typedef struct cuGraphNodeGetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int *isEnabled;
+} cuGraphNodeGetEnabled_params;
+
+typedef struct cuGraphUpload_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphUpload_ptsz_params;
+
+typedef struct cuGraphLaunch_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphLaunch_ptsz_params;
+
+typedef struct cuGraphExecDestroy_params_st {
+    CUgraphExec hGraphExec;
+} cuGraphExecDestroy_params;
+
+typedef struct cuGraphDestroy_params_st {
+    CUgraph hGraph;
+} cuGraphDestroy_params;
+
+typedef struct cuGraphExecUpdate_v2_params_st {
+    CUgraphExec hGraphExec;
+    CUgraph hGraph;
+    CUgraphExecUpdateResultInfo *resultInfo;
+} cuGraphExecUpdate_v2_params;
+
+typedef struct cuGraphKernelNodeCopyAttributes_params_st {
+    CUgraphNode dst;
+    CUgraphNode src;
+} cuGraphKernelNodeCopyAttributes_params;
+
+typedef struct cuGraphKernelNodeGetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    CUkernelNodeAttrValue *value_out;
+} cuGraphKernelNodeGetAttribute_params;
+
+typedef struct cuGraphKernelNodeSetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    const CUkernelNodeAttrValue *value;
+} cuGraphKernelNodeSetAttribute_params;
+
+typedef struct cuGraphDebugDotPrint_params_st {
+    CUgraph hGraph;
+    const char *path;
+    unsigned int flags;
+} cuGraphDebugDotPrint_params;
+
+typedef struct cuUserObjectCreate_params_st {
+    CUuserObject *object_out;
+    void *ptr;
+    CUhostFn destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cuUserObjectCreate_params;
+
+typedef struct cuUserObjectRetain_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRetain_params;
+
+typedef struct cuUserObjectRelease_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRelease_params;
+
+typedef struct cuGraphRetainUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+    unsigned int flags;
+} cuGraphRetainUserObject_params;
+
+typedef struct cuGraphReleaseUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+} cuGraphReleaseUserObject_params;
+
+typedef struct cuGraphAddNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUgraphNodeParams *nodeParams;
+} cuGraphAddNode_params;
+
+typedef struct cuGraphAddNode_v2_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUgraphNodeParams *nodeParams;
+} cuGraphAddNode_v2_params;
+
+typedef struct cuGraphNodeSetParams_params_st {
+    CUgraphNode hNode;
+    CUgraphNodeParams *nodeParams;
+} cuGraphNodeSetParams_params;
+
+typedef struct cuGraphExecNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUgraphNodeParams *nodeParams;
+} cuGraphExecNodeSetParams_params;
+
+typedef struct cuGraphConditionalHandleCreate_params_st {
+    CUgraphConditionalHandle *pHandle_out;
+    CUgraph hGraph;
+    CUcontext ctx;
+    unsigned int defaultLaunchValue;
+    unsigned int flags;
+} cuGraphConditionalHandleCreate_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessor_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cuOccupancyMaxActiveBlocksPerMultiprocessor_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;
+} cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSize_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+} cuOccupancyMaxPotentialBlockSize_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSizeWithFlags_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+    unsigned int flags;
+} cuOccupancyMaxPotentialBlockSizeWithFlags_params;
+
+typedef struct cuOccupancyAvailableDynamicSMemPerBlock_params_st {
+    size_t *dynamicSmemSize;
+    CUfunction func;
+    int numBlocks;
+    int blockSize;
+} cuOccupancyAvailableDynamicSMemPerBlock_params;
+
+typedef struct cuOccupancyMaxPotentialClusterSize_params_st {
+    int *clusterSize;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxPotentialClusterSize_params;
+
+typedef struct cuOccupancyMaxActiveClusters_params_st {
+    int *numClusters;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxActiveClusters_params;
+
+typedef struct cuTexRefSetArray_params_st {
+    CUtexref hTexRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuTexRefSetArray_params;
+
+typedef struct cuTexRefSetMipmappedArray_params_st {
+    CUtexref hTexRef;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int Flags;
+} cuTexRefSetMipmappedArray_params;
+
+typedef struct cuTexRefSetAddress_v2_params_st {
+    size_t *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr dptr;
+    size_t bytes;
+} cuTexRefSetAddress_v2_params;
+
+typedef struct cuTexRefSetAddress2D_v3_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v3_params;
+
+typedef struct cuTexRefSetFormat_params_st {
+    CUtexref hTexRef;
+    CUarray_format fmt;
+    int NumPackedComponents;
+} cuTexRefSetFormat_params;
+
+typedef struct cuTexRefSetAddressMode_params_st {
+    CUtexref hTexRef;
+    int dim;
+    CUaddress_mode am;
+} cuTexRefSetAddressMode_params;
+
+typedef struct cuTexRefSetFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetFilterMode_params;
+
+typedef struct cuTexRefSetMipmapFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetMipmapFilterMode_params;
+
+typedef struct cuTexRefSetMipmapLevelBias_params_st {
+    CUtexref hTexRef;
+    float bias;
+} cuTexRefSetMipmapLevelBias_params;
+
+typedef struct cuTexRefSetMipmapLevelClamp_params_st {
+    CUtexref hTexRef;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+} cuTexRefSetMipmapLevelClamp_params;
+
+typedef struct cuTexRefSetMaxAnisotropy_params_st {
+    CUtexref hTexRef;
+    unsigned int maxAniso;
+} cuTexRefSetMaxAnisotropy_params;
+
+typedef struct cuTexRefSetBorderColor_params_st {
+    CUtexref hTexRef;
+    float *pBorderColor;
+} cuTexRefSetBorderColor_params;
+
+typedef struct cuTexRefSetFlags_params_st {
+    CUtexref hTexRef;
+    unsigned int Flags;
+} cuTexRefSetFlags_params;
+
+typedef struct cuTexRefGetAddress_v2_params_st {
+    CUdeviceptr *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_v2_params;
+
+typedef struct cuTexRefGetArray_params_st {
+    CUarray *phArray;
+    CUtexref hTexRef;
+} cuTexRefGetArray_params;
+
+typedef struct cuTexRefGetMipmappedArray_params_st {
+    CUmipmappedArray *phMipmappedArray;
+    CUtexref hTexRef;
+} cuTexRefGetMipmappedArray_params;
+
+typedef struct cuTexRefGetAddressMode_params_st {
+    CUaddress_mode *pam;
+    CUtexref hTexRef;
+    int dim;
+} cuTexRefGetAddressMode_params;
+
+typedef struct cuTexRefGetFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetFilterMode_params;
+
+typedef struct cuTexRefGetFormat_params_st {
+    CUarray_format *pFormat;
+    int *pNumChannels;
+    CUtexref hTexRef;
+} cuTexRefGetFormat_params;
+
+typedef struct cuTexRefGetMipmapFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapFilterMode_params;
+
+typedef struct cuTexRefGetMipmapLevelBias_params_st {
+    float *pbias;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelBias_params;
+
+typedef struct cuTexRefGetMipmapLevelClamp_params_st {
+    float *pminMipmapLevelClamp;
+    float *pmaxMipmapLevelClamp;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelClamp_params;
+
+typedef struct cuTexRefGetMaxAnisotropy_params_st {
+    int *pmaxAniso;
+    CUtexref hTexRef;
+} cuTexRefGetMaxAnisotropy_params;
+
+typedef struct cuTexRefGetBorderColor_params_st {
+    float *pBorderColor;
+    CUtexref hTexRef;
+} cuTexRefGetBorderColor_params;
+
+typedef struct cuTexRefGetFlags_params_st {
+    unsigned int *pFlags;
+    CUtexref hTexRef;
+} cuTexRefGetFlags_params;
+
+typedef struct cuTexRefCreate_params_st {
+    CUtexref *pTexRef;
+} cuTexRefCreate_params;
+
+typedef struct cuTexRefDestroy_params_st {
+    CUtexref hTexRef;
+} cuTexRefDestroy_params;
+
+typedef struct cuSurfRefSetArray_params_st {
+    CUsurfref hSurfRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuSurfRefSetArray_params;
+
+typedef struct cuSurfRefGetArray_params_st {
+    CUarray *phArray;
+    CUsurfref hSurfRef;
+} cuSurfRefGetArray_params;
+
+typedef struct cuTexObjectCreate_params_st {
+    CUtexObject *pTexObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+    const CUDA_TEXTURE_DESC *pTexDesc;
+    const CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+} cuTexObjectCreate_params;
+
+typedef struct cuTexObjectDestroy_params_st {
+    CUtexObject texObject;
+} cuTexObjectDestroy_params;
+
+typedef struct cuTexObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceDesc_params;
+
+typedef struct cuTexObjectGetTextureDesc_params_st {
+    CUDA_TEXTURE_DESC *pTexDesc;
+    CUtexObject texObject;
+} cuTexObjectGetTextureDesc_params;
+
+typedef struct cuTexObjectGetResourceViewDesc_params_st {
+    CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceViewDesc_params;
+
+typedef struct cuSurfObjectCreate_params_st {
+    CUsurfObject *pSurfObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+} cuSurfObjectCreate_params;
+
+typedef struct cuSurfObjectDestroy_params_st {
+    CUsurfObject surfObject;
+} cuSurfObjectDestroy_params;
+
+typedef struct cuSurfObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUsurfObject surfObject;
+} cuSurfObjectGetResourceDesc_params;
+
+typedef struct cuTensorMapEncodeTiled_params_st {
+    CUtensorMap *tensorMap;
+    CUtensorMapDataType tensorDataType;
+    cuuint32_t tensorRank;
+    void *globalAddress;
+    const cuuint64_t *globalDim;
+    const cuuint64_t *globalStrides;
+    const cuuint32_t *boxDim;
+    const cuuint32_t *elementStrides;
+    CUtensorMapInterleave interleave;
+    CUtensorMapSwizzle swizzle;
+    CUtensorMapL2promotion l2Promotion;
+    CUtensorMapFloatOOBfill oobFill;
+} cuTensorMapEncodeTiled_params;
+
+typedef struct cuTensorMapEncodeIm2col_params_st {
+    CUtensorMap *tensorMap;
+    CUtensorMapDataType tensorDataType;
+    cuuint32_t tensorRank;
+    void *globalAddress;
+    const cuuint64_t *globalDim;
+    const cuuint64_t *globalStrides;
+    const int *pixelBoxLowerCorner;
+    const int *pixelBoxUpperCorner;
+    cuuint32_t channelsPerPixel;
+    cuuint32_t pixelsPerColumn;
+    const cuuint32_t *elementStrides;
+    CUtensorMapInterleave interleave;
+    CUtensorMapSwizzle swizzle;
+    CUtensorMapL2promotion l2Promotion;
+    CUtensorMapFloatOOBfill oobFill;
+} cuTensorMapEncodeIm2col_params;
+
+typedef struct cuTensorMapReplaceAddress_params_st {
+    CUtensorMap *tensorMap;
+    void *globalAddress;
+} cuTensorMapReplaceAddress_params;
+
+typedef struct cuDeviceCanAccessPeer_params_st {
+    int *canAccessPeer;
+    CUdevice dev;
+    CUdevice peerDev;
+} cuDeviceCanAccessPeer_params;
+
+typedef struct cuCtxEnablePeerAccess_params_st {
+    CUcontext peerContext;
+    unsigned int Flags;
+} cuCtxEnablePeerAccess_params;
+
+typedef struct cuCtxDisablePeerAccess_params_st {
+    CUcontext peerContext;
+} cuCtxDisablePeerAccess_params;
+
+typedef struct cuDeviceGetP2PAttribute_params_st {
+    int *value;
+    CUdevice_P2PAttribute attrib;
+    CUdevice srcDevice;
+    CUdevice dstDevice;
+} cuDeviceGetP2PAttribute_params;
+
+typedef struct cuGraphicsUnregisterResource_params_st {
+    CUgraphicsResource resource;
+} cuGraphicsUnregisterResource_params;
+
+typedef struct cuGraphicsSubResourceGetMappedArray_params_st {
+    CUarray *pArray;
+    CUgraphicsResource resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cuGraphicsSubResourceGetMappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedMipmappedArray_params_st {
+    CUmipmappedArray *pMipmappedArray;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedMipmappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_v2_params_st {
+    CUdeviceptr *pDevPtr;
+    size_t *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_v2_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_v2_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_v2_params;
+
+typedef struct cuGraphicsMapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_ptsz_params;
+
+typedef struct cuGraphicsUnmapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_ptsz_params;
+
+typedef struct cuGetProcAddress_v2_params_st {
+    const char *symbol;
+    void **pfn;
+    int cudaVersion;
+    cuuint64_t flags;
+    CUdriverProcAddressQueryResult *symbolStatus;
+} cuGetProcAddress_v2_params;
+
+typedef struct cuCoredumpGetAttribute_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpGetAttribute_params;
+
+typedef struct cuCoredumpGetAttributeGlobal_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpGetAttributeGlobal_params;
+
+typedef struct cuCoredumpSetAttribute_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpSetAttribute_params;
+
+typedef struct cuCoredumpSetAttributeGlobal_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpSetAttributeGlobal_params;
+
+typedef struct cuGetExportTable_params_st {
+    const void **ppExportTable;
+    const CUuuid *pExportTableId;
+} cuGetExportTable_params;
+
+typedef struct cuGreenCtxCreate_params_st {
+    CUgreenCtx *phCtx;
+    CUdevResourceDesc desc;
+    CUdevice dev;
+    unsigned int flags;
+} cuGreenCtxCreate_params;
+
+typedef struct cuGreenCtxDestroy_params_st {
+    CUgreenCtx hCtx;
+} cuGreenCtxDestroy_params;
+
+typedef struct cuCtxFromGreenCtx_params_st {
+    CUcontext *pContext;
+    CUgreenCtx hCtx;
+} cuCtxFromGreenCtx_params;
+
+typedef struct cuDeviceGetDevResource_params_st {
+    CUdevice device;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuDeviceGetDevResource_params;
+
+typedef struct cuCtxGetDevResource_params_st {
+    CUcontext hCtx;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuCtxGetDevResource_params;
+
+typedef struct cuGreenCtxGetDevResource_params_st {
+    CUgreenCtx hCtx;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuGreenCtxGetDevResource_params;
+
+typedef struct cuDevSmResourceSplitByCount_params_st {
+    CUdevResource *result;
+    unsigned int *nbGroups;
+    const CUdevResource *input;
+    CUdevResource *remaining;
+    unsigned int useFlags;
+    unsigned int minCount;
+} cuDevSmResourceSplitByCount_params;
+
+typedef struct cuDevResourceGenerateDesc_params_st {
+    CUdevResourceDesc *phDesc;
+    CUdevResource *resources;
+    unsigned int nbResources;
+} cuDevResourceGenerateDesc_params;
+
+typedef struct cuGreenCtxRecordEvent_params_st {
+    CUgreenCtx hCtx;
+    CUevent hEvent;
+} cuGreenCtxRecordEvent_params;
+
+typedef struct cuGreenCtxWaitEvent_params_st {
+    CUgreenCtx hCtx;
+    CUevent hEvent;
+} cuGreenCtxWaitEvent_params;
+
+typedef struct cuStreamGetGreenCtx_params_st {
+    CUstream hStream;
+    CUgreenCtx *phCtx;
+} cuStreamGetGreenCtx_params;
+
+typedef struct cuGreenCtxStreamCreate_params_st {
+    CUstream *phStream;
+    CUgreenCtx greenCtx;
+    unsigned int flags;
+    int priority;
+} cuGreenCtxStreamCreate_params;
+
+typedef struct cuMemHostRegister_params_st {
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_params;
+
+typedef struct cuLinkCreate_params_st {
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_params;
+
+typedef struct cuLinkAddData_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_params;
+
+typedef struct cuLinkAddFile_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_params;
+
+typedef struct cuTexRefSetAddress2D_v2_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v2_params;
+
+typedef struct cuDeviceTotalMem_params_st {
+    unsigned int *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_params;
+
+typedef struct cuCtxCreate_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_params;
+
+typedef struct cuModuleGetGlobal_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_params;
+
+typedef struct cuMemGetInfo_params_st {
+    unsigned int *free;
+    unsigned int *total;
+} cuMemGetInfo_params;
+
+typedef struct cuMemAlloc_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int bytesize;
+} cuMemAlloc_params;
+
+typedef struct cuMemAllocPitch_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *pPitch;
+    unsigned int WidthInBytes;
+    unsigned int Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_params;
+
+typedef struct cuMemFree_params_st {
+    CUdeviceptr_v1 dptr;
+} cuMemFree_params;
+
+typedef struct cuMemGetAddressRange_params_st {
+    CUdeviceptr_v1 *pbase;
+    unsigned int *psize;
+    CUdeviceptr_v1 dptr;
+} cuMemGetAddressRange_params;
+
+typedef struct cuMemAllocHost_params_st {
+    void **pp;
+    unsigned int bytesize;
+} cuMemAllocHost_params;
+
+typedef struct cuMemHostGetDevicePointer_params_st {
+    CUdeviceptr_v1 *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_params;
+
+typedef struct cuMemcpyHtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoD_params;
+
+typedef struct cuMemcpyDtoH_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoH_params;
+
+typedef struct cuMemcpyDtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoD_params;
+
+typedef struct cuMemcpyDtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoA_params;
+
+typedef struct cuMemcpyAtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoD_params;
+
+typedef struct cuMemcpyHtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoA_params;
+
+typedef struct cuMemcpyAtoH_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoH_params;
+
+typedef struct cuMemcpyAtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoA_params;
+
+typedef struct cuMemcpyHtoAAsync_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_params;
+
+typedef struct cuMemcpyAtoHAsync_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_params;
+
+typedef struct cuMemcpy2D_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2D_params;
+
+typedef struct cuMemcpy2DUnaligned_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2DUnaligned_params;
+
+typedef struct cuMemcpy3D_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+} cuMemcpy3D_params;
+
+typedef struct cuMemcpyHtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_params;
+
+typedef struct cuMemcpyDtoHAsync_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_params;
+
+typedef struct cuMemcpyDtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_params;
+
+typedef struct cuMemcpy2DAsync_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_params;
+
+typedef struct cuMemcpy3DAsync_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_params;
+
+typedef struct cuMemsetD8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned char uc;
+    unsigned int N;
+} cuMemsetD8_params;
+
+typedef struct cuMemsetD16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned short us;
+    unsigned int N;
+} cuMemsetD16_params;
+
+typedef struct cuMemsetD32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int ui;
+    unsigned int N;
+} cuMemsetD32_params;
+
+typedef struct cuMemsetD2D8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned char uc;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D8_params;
+
+typedef struct cuMemsetD2D16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned short us;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D16_params;
+
+typedef struct cuMemsetD2D32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned int ui;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D32_params;
+
+typedef struct cuArrayCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray;
+} cuArrayCreate_params;
+
+typedef struct cuArrayGetDescriptor_params_st {
+    CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_params;
+
+typedef struct cuArray3DCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray;
+} cuArray3DCreate_params;
+
+typedef struct cuArray3DGetDescriptor_params_st {
+    CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_params;
+
+typedef struct cuTexRefSetAddress_params_st {
+    unsigned int *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr_v1 dptr;
+    unsigned int bytes;
+} cuTexRefSetAddress_params;
+
+typedef struct cuTexRefSetAddress2D_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *desc;
+    CUdeviceptr_v1 dptr;
+    unsigned int Pitch;
+} cuTexRefSetAddress2D_params;
+
+typedef struct cuTexRefGetAddress_params_st {
+    CUdeviceptr_v1 *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_params_st {
+    CUdeviceptr_v1 *pDevPtr;
+    unsigned int *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_params;
+
+typedef struct cuCtxDestroy_params_st {
+    CUcontext ctx;
+} cuCtxDestroy_params;
+
+typedef struct cuCtxPopCurrent_params_st {
+    CUcontext *pctx;
+} cuCtxPopCurrent_params;
+
+typedef struct cuCtxPushCurrent_params_st {
+    CUcontext ctx;
+} cuCtxPushCurrent_params;
+
+typedef struct cuStreamDestroy_params_st {
+    CUstream hStream;
+} cuStreamDestroy_params;
+
+typedef struct cuEventDestroy_params_st {
+    CUevent hEvent;
+} cuEventDestroy_params;
+
+typedef struct cuDevicePrimaryCtxRelease_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_params;
+
+typedef struct cuDevicePrimaryCtxReset_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_params_st {
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_params;
+
+typedef struct cuMemcpyHtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_params;
+
+typedef struct cuMemcpyDtoH_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_params;
+
+typedef struct cuMemcpyDtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_params;
+
+typedef struct cuMemcpyDtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_params;
+
+typedef struct cuMemcpyAtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_params;
+
+typedef struct cuMemcpyHtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_params;
+
+typedef struct cuMemcpyAtoH_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_params;
+
+typedef struct cuMemcpyAtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_params;
+
+typedef struct cuMemcpy2D_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_params;
+
+typedef struct cuMemcpy3D_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_params;
+
+typedef struct cuMemcpy2DAsync_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_params;
+
+typedef struct cuMemcpy3DAsync_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_params;
+
+typedef struct cuMemsetD8_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_params;
+
+typedef struct cuMemsetD16_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_params;
+
+typedef struct cuMemsetD32_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_params;
+
+typedef struct cuMemsetD2D8_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_params;
+
+typedef struct cuMemsetD2D16_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_params;
+
+typedef struct cuMemsetD2D32_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_params;
+
+typedef struct cuMemcpy_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_params;
+
+typedef struct cuMemcpyAsync_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_params;
+
+typedef struct cuMemcpyPeer_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_params;
+
+typedef struct cuMemcpyPeerAsync_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_params;
+
+typedef struct cuMemcpy3DPeer_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_params;
+
+typedef struct cuMemcpy3DPeerAsync_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_params;
+
+typedef struct cuMemcpyBatchAsync_params_st {
+    CUdeviceptr *dsts;
+    CUdeviceptr *srcs;
+    size_t *sizes;
+    size_t count;
+    CUmemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    CUstream hStream;
+} cuMemcpyBatchAsync_params;
+
+typedef struct cuMemcpy3DBatchAsync_params_st {
+    size_t numOps;
+    CUDA_MEMCPY3D_BATCH_OP *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    CUstream hStream;
+} cuMemcpy3DBatchAsync_params;
+
+typedef struct cuMemsetD8Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_params;
+
+typedef struct cuMemsetD16Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_params;
+
+typedef struct cuMemsetD32Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_params;
+
+typedef struct cuMemsetD2D8Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_params;
+
+typedef struct cuMemsetD2D16Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_params;
+
+typedef struct cuMemsetD2D32Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_params;
+
+typedef struct cuStreamGetPriority_params_st {
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_params;
+
+typedef struct cuStreamGetId_params_st {
+    CUstream hStream;
+    unsigned long long *streamId;
+} cuStreamGetId_params;
+
+typedef struct cuStreamGetFlags_params_st {
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_params;
+
+typedef struct cuStreamGetDevice_params_st {
+    CUstream hStream;
+    CUdevice *device;
+} cuStreamGetDevice_params;
+
+typedef struct cuStreamGetCtx_params_st {
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_params;
+
+typedef struct cuStreamGetCtx_v2_params_st {
+    CUstream hStream;
+    CUcontext *pCtx;
+    CUgreenCtx *pGreenCtx;
+} cuStreamGetCtx_v2_params;
+
+typedef struct cuStreamWaitEvent_params_st {
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_params;
+
+typedef struct cuStreamAddCallback_params_st {
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_params;
+
+typedef struct cuStreamAttachMemAsync_params_st {
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_params;
+
+typedef struct cuStreamQuery_params_st {
+    CUstream hStream;
+} cuStreamQuery_params;
+
+typedef struct cuStreamSynchronize_params_st {
+    CUstream hStream;
+} cuStreamSynchronize_params;
+
+typedef struct cuEventRecord_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_params;
+
+typedef struct cuEventRecordWithFlags_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_params;
+
+typedef struct cuLaunchKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_params;
+
+typedef struct cuLaunchKernelEx_params_st {
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_params;
+
+typedef struct cuLaunchHostFunc_params_st {
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_params;
+
+typedef struct cuGraphicsMapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_params;
+
+typedef struct cuGraphicsUnmapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_params;
+
+typedef struct cuStreamWriteValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_params;
+
+typedef struct cuStreamWaitValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_params;
+
+typedef struct cuStreamWriteValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_params;
+
+typedef struct cuStreamWaitValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_params;
+
+typedef struct cuStreamBatchMemOp_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_params;
+
+typedef struct cuStreamWriteValue32_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_ptsz_params;
+
+typedef struct cuStreamWaitValue32_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_ptsz_params;
+
+typedef struct cuStreamWriteValue64_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_ptsz_params;
+
+typedef struct cuStreamWaitValue64_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_ptsz_params;
+
+typedef struct cuStreamBatchMemOp_ptsz_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_ptsz_params;
+
+typedef struct cuStreamWriteValue32_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_v2_params;
+
+typedef struct cuStreamWaitValue32_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_v2_params;
+
+typedef struct cuStreamWriteValue64_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_v2_params;
+
+typedef struct cuStreamWaitValue64_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_v2_params;
+
+typedef struct cuStreamBatchMemOp_v2_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_v2_params;
+
+typedef struct cuMemPrefetchAsync_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_params;
+
+typedef struct cuMemPrefetchAsync_v2_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmemLocation location;
+    unsigned int flags;
+    CUstream hStream;
+} cuMemPrefetchAsync_v2_params;
+
+typedef struct cuLaunchCooperativeKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_params;
+
+typedef struct cuStreamBeginCapture_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_params;
+
+typedef struct cuStreamBeginCapture_ptsz_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_params_st {
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_params;
+
+typedef struct cuStreamBeginCaptureToGraph_params_st {
+    CUstream hStream;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCaptureToGraph_params;
+
+typedef struct cuStreamEndCapture_params_st {
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_params;
+
+typedef struct cuStreamIsCapturing_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;
+} cuStreamIsCapturing_params;
+
+typedef struct cuStreamGetCaptureInfo_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_params;
+
+typedef struct cuStreamGetCaptureInfo_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_params;
+
+typedef struct cuStreamGetCaptureInfo_v3_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    const CUgraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v3_params;
+
+typedef struct cuGraphAddKernelNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphAddKernelNode_params;
+
+typedef struct cuGraphKernelNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphKernelNodeGetParams_params;
+
+typedef struct cuGraphKernelNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphKernelNodeSetParams_params;
+
+typedef struct cuGraphExecKernelNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;
+} cuGraphExecKernelNodeSetParams_params;
+
+typedef struct cuGraphInstantiateWithParams_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams;
+} cuGraphInstantiateWithParams_params;
+
+typedef struct cuGraphExecUpdate_params_st {
+    CUgraphExec hGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *hErrorNode_out;
+    CUgraphExecUpdateResult *updateResult_out;
+} cuGraphExecUpdate_params;
+
+typedef struct cuGraphUpload_params_st {
+    CUgraphExec hGraph;
+    CUstream hStream;
+} cuGraphUpload_params;
+
+typedef struct cuGraphLaunch_params_st {
+    CUgraphExec hGraph;
+    CUstream hStream;
+} cuGraphLaunch_params;
+
+typedef struct cuStreamCopyAttributes_params_st {
+    CUstream dstStream;
+    CUstream srcStream;
+} cuStreamCopyAttributes_params;
+
+typedef struct cuStreamGetAttribute_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value;
+} cuStreamGetAttribute_params;
+
+typedef struct cuStreamSetAttribute_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *param;
+} cuStreamSetAttribute_params;
+
+typedef struct cuIpcOpenMemHandle_params_st {
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;
+    unsigned int Flags;
+} cuIpcOpenMemHandle_params;
+
+typedef struct cuGraphInstantiate_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize;
+} cuGraphInstantiate_params;
+
+typedef struct cuGraphInstantiate_v2_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize;
+} cuGraphInstantiate_v2_params;
+
+typedef struct cuMemMapArrayAsync_params_st {
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count;
+    CUstream hStream;
+} cuMemMapArrayAsync_params;
+
+typedef struct cuMemFreeAsync_params_st {
+    CUdeviceptr dptr;
+    CUstream hStream;
+} cuMemFreeAsync_params;
+
+typedef struct cuMemAllocAsync_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_params;
+
+typedef struct cuMemAllocFromPoolAsync_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUmemoryPool pool;
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_v2_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_v2_params;
+
+typedef struct cuMemBatchDecompressAsync_params_st {
+    CUmemDecompressParams *paramsArray;
+    size_t count;
+    unsigned int flags;
+    size_t *errorIndex;
+    CUstream stream;
+} cuMemBatchDecompressAsync_params;
+
+typedef struct cuGetProcAddress_params_st {
+    const char *symbol;
+    void **pfn;
+    int cudaVersion;
+    cuuint64_t flags;
+} cuGetProcAddress_params;
+
+typedef struct cuCheckpointProcessGetRestoreThreadId_params_st {
+    int pid;
+    int *tid;
+} cuCheckpointProcessGetRestoreThreadId_params;
+
+typedef struct cuCheckpointProcessGetState_params_st {
+    int pid;
+    CUprocessState *state;
+} cuCheckpointProcessGetState_params;
+
+typedef struct cuCheckpointProcessLock_params_st {
+    int pid;
+    CUcheckpointLockArgs *args;
+} cuCheckpointProcessLock_params;
+
+typedef struct cuCheckpointProcessCheckpoint_params_st {
+    int pid;
+    CUcheckpointCheckpointArgs *args;
+} cuCheckpointProcessCheckpoint_params;
+
+typedef struct cuCheckpointProcessRestore_params_st {
+    int pid;
+    CUcheckpointRestoreArgs *args;
+} cuCheckpointProcessRestore_params;
+
+typedef struct cuCheckpointProcessUnlock_params_st {
+    int pid;
+    CUcheckpointUnlockArgs *args;
+} cuCheckpointProcessUnlock_params;
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_runtime_api_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_runtime_api_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..52321905dd0a82e550332f5d67b03fd4612860e7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_runtime_api_meta.h
@@ -0,0 +1,2372 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_runtime_api.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaDeviceSetLimit_v3020_params_st {
+    enum cudaLimit limit;
+    size_t value;
+} cudaDeviceSetLimit_v3020_params;
+
+typedef struct cudaDeviceGetLimit_v3020_params_st {
+    size_t *pValue;
+    enum cudaLimit limit;
+} cudaDeviceGetLimit_v3020_params;
+
+typedef struct cudaDeviceGetTexture1DLinearMaxWidth_v11010_params_st {
+    size_t *maxWidthInElements;
+    const struct cudaChannelFormatDesc *fmtDesc;
+    int device;
+} cudaDeviceGetTexture1DLinearMaxWidth_v11010_params;
+
+typedef struct cudaDeviceGetCacheConfig_v3020_params_st {
+    enum cudaFuncCache *pCacheConfig;
+} cudaDeviceGetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetStreamPriorityRange_v5050_params_st {
+    int *leastPriority;
+    int *greatestPriority;
+} cudaDeviceGetStreamPriorityRange_v5050_params;
+
+typedef struct cudaDeviceSetCacheConfig_v3020_params_st {
+    enum cudaFuncCache cacheConfig;
+} cudaDeviceSetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetByPCIBusId_v4010_params_st {
+    int *device;
+    const char *pciBusId;
+} cudaDeviceGetByPCIBusId_v4010_params;
+
+typedef struct cudaDeviceGetPCIBusId_v4010_params_st {
+    char *pciBusId;
+    int len;
+    int device;
+} cudaDeviceGetPCIBusId_v4010_params;
+
+typedef struct cudaIpcGetEventHandle_v4010_params_st {
+    cudaIpcEventHandle_t *handle;
+    cudaEvent_t event;
+} cudaIpcGetEventHandle_v4010_params;
+
+typedef struct cudaIpcOpenEventHandle_v4010_params_st {
+    cudaEvent_t *event;
+    cudaIpcEventHandle_t handle;
+} cudaIpcOpenEventHandle_v4010_params;
+
+typedef struct cudaIpcGetMemHandle_v4010_params_st {
+    cudaIpcMemHandle_t *handle;
+    void *devPtr;
+} cudaIpcGetMemHandle_v4010_params;
+
+typedef struct cudaIpcOpenMemHandle_v4010_params_st {
+    void **devPtr;
+    cudaIpcMemHandle_t handle;
+    unsigned int flags;
+} cudaIpcOpenMemHandle_v4010_params;
+
+typedef struct cudaIpcCloseMemHandle_v4010_params_st {
+    void *devPtr;
+} cudaIpcCloseMemHandle_v4010_params;
+
+typedef struct cudaDeviceFlushGPUDirectRDMAWrites_v11030_params_st {
+    enum cudaFlushGPUDirectRDMAWritesTarget target;
+    enum cudaFlushGPUDirectRDMAWritesScope scope;
+} cudaDeviceFlushGPUDirectRDMAWrites_v11030_params;
+
+typedef struct cudaDeviceRegisterAsyncNotification_v12040_params_st {
+    int device;
+    cudaAsyncCallback callbackFunc;
+    void *userData;
+    cudaAsyncCallbackHandle_t *callback;
+} cudaDeviceRegisterAsyncNotification_v12040_params;
+
+typedef struct cudaDeviceUnregisterAsyncNotification_v12040_params_st {
+    int device;
+    cudaAsyncCallbackHandle_t callback;
+} cudaDeviceUnregisterAsyncNotification_v12040_params;
+
+typedef struct cudaDeviceGetSharedMemConfig_v4020_params_st {
+    enum cudaSharedMemConfig *pConfig;
+} cudaDeviceGetSharedMemConfig_v4020_params;
+
+typedef struct cudaDeviceSetSharedMemConfig_v4020_params_st {
+    enum cudaSharedMemConfig config;
+} cudaDeviceSetSharedMemConfig_v4020_params;
+
+typedef struct cudaGetErrorName_v6050_params_st {
+    cudaError_t error;
+} cudaGetErrorName_v6050_params;
+
+typedef struct cudaGetErrorString_v3020_params_st {
+    cudaError_t error;
+} cudaGetErrorString_v3020_params;
+
+typedef struct cudaGetDeviceCount_v3020_params_st {
+    int *count;
+} cudaGetDeviceCount_v3020_params;
+
+typedef struct cudaGetDeviceProperties_v2_v12000_params_st {
+    struct cudaDeviceProp *prop;
+    int device;
+} cudaGetDeviceProperties_v2_v12000_params;
+
+typedef struct cudaDeviceGetAttribute_v5000_params_st {
+    int *value;
+    enum cudaDeviceAttr attr;
+    int device;
+} cudaDeviceGetAttribute_v5000_params;
+
+typedef struct cudaDeviceGetDefaultMemPool_v11020_params_st {
+    cudaMemPool_t *memPool;
+    int device;
+} cudaDeviceGetDefaultMemPool_v11020_params;
+
+typedef struct cudaDeviceSetMemPool_v11020_params_st {
+    int device;
+    cudaMemPool_t memPool;
+} cudaDeviceSetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetMemPool_v11020_params_st {
+    cudaMemPool_t *memPool;
+    int device;
+} cudaDeviceGetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetNvSciSyncAttributes_v10020_params_st {
+    void *nvSciSyncAttrList;
+    int device;
+    int flags;
+} cudaDeviceGetNvSciSyncAttributes_v10020_params;
+
+typedef struct cudaDeviceGetP2PAttribute_v8000_params_st {
+    int *value;
+    enum cudaDeviceP2PAttr attr;
+    int srcDevice;
+    int dstDevice;
+} cudaDeviceGetP2PAttribute_v8000_params;
+
+typedef struct cudaChooseDevice_v3020_params_st {
+    int *device;
+    const struct cudaDeviceProp *prop;
+} cudaChooseDevice_v3020_params;
+
+typedef struct cudaInitDevice_v12000_params_st {
+    int device;
+    unsigned int deviceFlags;
+    unsigned int flags;
+} cudaInitDevice_v12000_params;
+
+typedef struct cudaSetDevice_v3020_params_st {
+    int device;
+} cudaSetDevice_v3020_params;
+
+typedef struct cudaGetDevice_v3020_params_st {
+    int *device;
+} cudaGetDevice_v3020_params;
+
+typedef struct cudaSetValidDevices_v3020_params_st {
+    int *device_arr;
+    int len;
+} cudaSetValidDevices_v3020_params;
+
+typedef struct cudaSetDeviceFlags_v3020_params_st {
+    unsigned int flags;
+} cudaSetDeviceFlags_v3020_params;
+
+typedef struct cudaGetDeviceFlags_v7000_params_st {
+    unsigned int *flags;
+} cudaGetDeviceFlags_v7000_params;
+
+typedef struct cudaStreamCreate_v3020_params_st {
+    cudaStream_t *pStream;
+} cudaStreamCreate_v3020_params;
+
+typedef struct cudaStreamCreateWithFlags_v5000_params_st {
+    cudaStream_t *pStream;
+    unsigned int flags;
+} cudaStreamCreateWithFlags_v5000_params;
+
+typedef struct cudaStreamCreateWithPriority_v5050_params_st {
+    cudaStream_t *pStream;
+    unsigned int flags;
+    int priority;
+} cudaStreamCreateWithPriority_v5050_params;
+
+typedef struct cudaStreamGetPriority_ptsz_v7000_params_st {
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_ptsz_v7000_params;
+
+typedef struct cudaStreamGetFlags_ptsz_v7000_params_st {
+    cudaStream_t hStream;
+    unsigned int *flags;
+} cudaStreamGetFlags_ptsz_v7000_params;
+
+typedef struct cudaStreamGetId_ptsz_v12000_params_st {
+    cudaStream_t hStream;
+    unsigned long long *streamId;
+} cudaStreamGetId_ptsz_v12000_params;
+
+typedef struct cudaStreamGetDevice_ptsz_v12080_params_st {
+    cudaStream_t hStream;
+    int *device;
+} cudaStreamGetDevice_ptsz_v12080_params;
+
+typedef struct cudaStreamCopyAttributes_ptsz_v11000_params_st {
+    cudaStream_t dst;
+    cudaStream_t src;
+} cudaStreamCopyAttributes_ptsz_v11000_params;
+
+typedef struct cudaStreamGetAttribute_ptsz_v11000_params_st {
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value_out;
+} cudaStreamGetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamSetAttribute_ptsz_v11000_params_st {
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *value;
+} cudaStreamSetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamDestroy_v5050_params_st {
+    cudaStream_t stream;
+} cudaStreamDestroy_v5050_params;
+
+typedef struct cudaStreamWaitEvent_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_ptsz_v7000_params;
+
+typedef struct cudaStreamAddCallback_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;
+    unsigned int flags;
+} cudaStreamAddCallback_ptsz_v7000_params;
+
+typedef struct cudaStreamSynchronize_ptsz_v7000_params_st {
+    cudaStream_t stream;
+} cudaStreamSynchronize_ptsz_v7000_params;
+
+typedef struct cudaStreamQuery_ptsz_v7000_params_st {
+    cudaStream_t stream;
+} cudaStreamQuery_ptsz_v7000_params;
+
+typedef struct cudaStreamAttachMemAsync_ptsz_v7000_params_st {
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_ptsz_v7000_params;
+
+typedef struct cudaStreamBeginCapture_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCapture_ptsz_v10000_params;
+
+typedef struct cudaStreamBeginCaptureToGraph_ptsz_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCaptureToGraph_ptsz_v12030_params;
+
+typedef struct cudaThreadExchangeStreamCaptureMode_v10010_params_st {
+    enum cudaStreamCaptureMode *mode;
+} cudaThreadExchangeStreamCaptureMode_v10010_params;
+
+typedef struct cudaStreamEndCapture_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;
+} cudaStreamEndCapture_ptsz_v10000_params;
+
+typedef struct cudaStreamIsCapturing_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+} cudaStreamIsCapturing_ptsz_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_ptsz_v11030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_ptsz_v11030_params;
+
+typedef struct cudaStreamGetCaptureInfo_v3_ptsz_v12030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    const cudaGraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v3_ptsz_v12030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_ptsz_v11030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_ptsz_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030_params;
+
+typedef struct cudaEventCreate_v3020_params_st {
+    cudaEvent_t *event;
+} cudaEventCreate_v3020_params;
+
+typedef struct cudaEventCreateWithFlags_v3020_params_st {
+    cudaEvent_t *event;
+    unsigned int flags;
+} cudaEventCreateWithFlags_v3020_params;
+
+typedef struct cudaEventRecord_ptsz_v7000_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_ptsz_v7000_params;
+
+typedef struct cudaEventRecordWithFlags_ptsz_v11010_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;
+} cudaEventRecordWithFlags_ptsz_v11010_params;
+
+typedef struct cudaEventQuery_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventQuery_v3020_params;
+
+typedef struct cudaEventSynchronize_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventSynchronize_v3020_params;
+
+typedef struct cudaEventDestroy_v3020_params_st {
+    cudaEvent_t event;
+} cudaEventDestroy_v3020_params;
+
+typedef struct cudaEventElapsedTime_v3020_params_st {
+    float *ms;
+    cudaEvent_t start;
+    cudaEvent_t end;
+} cudaEventElapsedTime_v3020_params;
+
+typedef struct cudaEventElapsedTime_v2_v12080_params_st {
+    float *ms;
+    cudaEvent_t start;
+    cudaEvent_t end;
+} cudaEventElapsedTime_v2_v12080_params;
+
+typedef struct cudaImportExternalMemory_v10000_params_st {
+    cudaExternalMemory_t *extMem_out;
+    const struct cudaExternalMemoryHandleDesc *memHandleDesc;
+} cudaImportExternalMemory_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedBuffer_v10000_params_st {
+    void **devPtr;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryBufferDesc *bufferDesc;
+} cudaExternalMemoryGetMappedBuffer_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedMipmappedArray_v10000_params_st {
+    cudaMipmappedArray_t *mipmap;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc;
+} cudaExternalMemoryGetMappedMipmappedArray_v10000_params;
+
+typedef struct cudaDestroyExternalMemory_v10000_params_st {
+    cudaExternalMemory_t extMem;
+} cudaDestroyExternalMemory_v10000_params;
+
+typedef struct cudaImportExternalSemaphore_v10000_params_st {
+    cudaExternalSemaphore_t *extSem_out;
+    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc;
+} cudaImportExternalSemaphore_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaDestroyExternalSemaphore_v10000_params_st {
+    cudaExternalSemaphore_t extSem;
+} cudaDestroyExternalSemaphore_v10000_params;
+
+typedef struct cudaLaunchKernel_ptsz_v7000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_ptsz_v7000_params;
+
+typedef struct cudaLaunchKernelExC_ptsz_v11060_params_st {
+    const cudaLaunchConfig_t *config;
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_ptsz_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_ptsz_v9000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_ptsz_v9000_params;
+
+typedef struct cudaLaunchCooperativeKernelMultiDevice_v9000_params_st {
+    struct cudaLaunchParams *launchParamsList;
+    unsigned int numDevices;
+    unsigned int flags;
+} cudaLaunchCooperativeKernelMultiDevice_v9000_params;
+
+typedef struct cudaFuncSetCacheConfig_v3020_params_st {
+    const void *func;
+    enum cudaFuncCache cacheConfig;
+} cudaFuncSetCacheConfig_v3020_params;
+
+typedef struct cudaFuncGetAttributes_v3020_params_st {
+    struct cudaFuncAttributes *attr;
+    const void *func;
+} cudaFuncGetAttributes_v3020_params;
+
+typedef struct cudaFuncSetAttribute_v9000_params_st {
+    const void *func;
+    enum cudaFuncAttribute attr;
+    int value;
+} cudaFuncSetAttribute_v9000_params;
+
+typedef struct cudaFuncGetName_v12030_params_st {
+    const char **name;
+    const void *func;
+} cudaFuncGetName_v12030_params;
+
+typedef struct cudaFuncGetParamInfo_v12040_params_st {
+    const void *func;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cudaFuncGetParamInfo_v12040_params;
+
+typedef struct cudaLaunchHostFunc_ptsz_v10000_params_st {
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;
+} cudaLaunchHostFunc_ptsz_v10000_params;
+
+typedef struct cudaFuncSetSharedMemConfig_v4020_params_st {
+    const void *func;
+    enum cudaSharedMemConfig config;
+} cudaFuncSetSharedMemConfig_v4020_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params_st {
+    int *numBlocks;
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params;
+
+typedef struct cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params_st {
+    size_t *dynamicSmemSize;
+    const void *func;
+    int numBlocks;
+    int blockSize;
+} cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params_st {
+    int *numBlocks;
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params;
+
+typedef struct cudaOccupancyMaxPotentialClusterSize_v11070_params_st {
+    int *clusterSize;
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxPotentialClusterSize_v11070_params;
+
+typedef struct cudaOccupancyMaxActiveClusters_v11070_params_st {
+    int *numClusters;
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxActiveClusters_v11070_params;
+
+typedef struct cudaMallocManaged_v6000_params_st {
+    void **devPtr;
+    size_t size;
+    unsigned int flags;
+} cudaMallocManaged_v6000_params;
+
+typedef struct cudaMalloc_v3020_params_st {
+    void **devPtr;
+    size_t size;
+} cudaMalloc_v3020_params;
+
+typedef struct cudaMallocHost_v3020_params_st {
+    void **ptr;
+    size_t size;
+} cudaMallocHost_v3020_params;
+
+typedef struct cudaMallocPitch_v3020_params_st {
+    void **devPtr;
+    size_t *pitch;
+    size_t width;
+    size_t height;
+} cudaMallocPitch_v3020_params;
+
+typedef struct cudaMallocArray_v3020_params_st {
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    unsigned int flags;
+} cudaMallocArray_v3020_params;
+
+typedef struct cudaFree_v3020_params_st {
+    void *devPtr;
+} cudaFree_v3020_params;
+
+typedef struct cudaFreeHost_v3020_params_st {
+    void *ptr;
+} cudaFreeHost_v3020_params;
+
+typedef struct cudaFreeArray_v3020_params_st {
+    cudaArray_t array;
+} cudaFreeArray_v3020_params;
+
+typedef struct cudaFreeMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t mipmappedArray;
+} cudaFreeMipmappedArray_v5000_params;
+
+typedef struct cudaHostAlloc_v3020_params_st {
+    void **pHost;
+    size_t size;
+    unsigned int flags;
+} cudaHostAlloc_v3020_params;
+
+typedef struct cudaHostRegister_v4000_params_st {
+    void *ptr;
+    size_t size;
+    unsigned int flags;
+} cudaHostRegister_v4000_params;
+
+typedef struct cudaHostUnregister_v4000_params_st {
+    void *ptr;
+} cudaHostUnregister_v4000_params;
+
+typedef struct cudaHostGetDevicePointer_v3020_params_st {
+    void **pDevice;
+    void *pHost;
+    unsigned int flags;
+} cudaHostGetDevicePointer_v3020_params;
+
+typedef struct cudaHostGetFlags_v3020_params_st {
+    unsigned int *pFlags;
+    void *pHost;
+} cudaHostGetFlags_v3020_params;
+
+typedef struct cudaMalloc3D_v3020_params_st {
+    struct cudaPitchedPtr *pitchedDevPtr;
+    struct cudaExtent extent;
+} cudaMalloc3D_v3020_params;
+
+typedef struct cudaMalloc3DArray_v3020_params_st {
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;
+    unsigned int flags;
+} cudaMalloc3DArray_v3020_params;
+
+typedef struct cudaMallocMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t *mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;
+    unsigned int numLevels;
+    unsigned int flags;
+} cudaMallocMipmappedArray_v5000_params;
+
+typedef struct cudaGetMipmappedArrayLevel_v5000_params_st {
+    cudaArray_t *levelArray;
+    cudaMipmappedArray_const_t mipmappedArray;
+    unsigned int level;
+} cudaGetMipmappedArrayLevel_v5000_params;
+
+typedef struct cudaMemcpy3D_ptds_v7000_params_st {
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DPeer_ptds_v7000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DAsync_ptsz_v7000_params_st {
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy3DPeerAsync_ptsz_v7000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DPeerAsync_ptsz_v7000_params;
+
+typedef struct cudaMemGetInfo_v3020_params_st {
+    size_t *free;
+    size_t *total;
+} cudaMemGetInfo_v3020_params;
+
+typedef struct cudaArrayGetInfo_v4010_params_st {
+    struct cudaChannelFormatDesc *desc;
+    struct cudaExtent *extent;
+    unsigned int *flags;
+    cudaArray_t array;
+} cudaArrayGetInfo_v4010_params;
+
+typedef struct cudaArrayGetPlane_v11020_params_st {
+    cudaArray_t *pPlaneArray;
+    cudaArray_t hArray;
+    unsigned int planeIdx;
+} cudaArrayGetPlane_v11020_params;
+
+typedef struct cudaArrayGetMemoryRequirements_v11060_params_st {
+    struct cudaArrayMemoryRequirements *memoryRequirements;
+    cudaArray_t array;
+    int device;
+} cudaArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaMipmappedArrayGetMemoryRequirements_v11060_params_st {
+    struct cudaArrayMemoryRequirements *memoryRequirements;
+    cudaMipmappedArray_t mipmap;
+    int device;
+} cudaMipmappedArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaArrayGetSparseProperties_v11010_params_st {
+    struct cudaArraySparseProperties *sparseProperties;
+    cudaArray_t array;
+} cudaArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMipmappedArrayGetSparseProperties_v11010_params_st {
+    struct cudaArraySparseProperties *sparseProperties;
+    cudaMipmappedArray_t mipmap;
+} cudaMipmappedArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMemcpy_ptds_v7000_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_ptds_v7000_params;
+
+typedef struct cudaMemcpyPeer_v4000_params_st {
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+} cudaMemcpyPeer_v4000_params;
+
+typedef struct cudaMemcpy2D_ptds_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DFromArray_ptds_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DArrayToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToSymbol_ptds_v7000_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromSymbol_ptds_v7000_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyAsync_ptsz_v7000_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyPeerAsync_v4000_params_st {
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemcpyPeerAsync_v4000_params;
+
+typedef struct cudaMemcpyBatchAsync_ptsz_v12080_params_st {
+    void **dsts;
+    void **srcs;
+    size_t *sizes;
+    size_t count;
+    struct cudaMemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    cudaStream_t stream;
+} cudaMemcpyBatchAsync_ptsz_v12080_params;
+
+typedef struct cudaMemcpy3DBatchAsync_ptsz_v12080_params_st {
+    size_t numOps;
+    struct cudaMemcpy3DBatchOp *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    cudaStream_t stream;
+} cudaMemcpy3DBatchAsync_ptsz_v12080_params;
+
+typedef struct cudaMemcpy2DAsync_ptsz_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_ptsz_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_ptsz_v7000_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyToSymbolAsync_ptsz_v7000_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_ptsz_v7000_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset_ptds_v7000_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+} cudaMemset_ptds_v7000_params;
+
+typedef struct cudaMemset2D_ptds_v7000_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_ptds_v7000_params;
+
+typedef struct cudaMemset3D_ptds_v7000_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+} cudaMemset3D_ptds_v7000_params;
+
+typedef struct cudaMemsetAsync_ptsz_v7000_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemsetAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset2DAsync_ptsz_v7000_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;
+} cudaMemset2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset3DAsync_ptsz_v7000_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+    cudaStream_t stream;
+} cudaMemset3DAsync_ptsz_v7000_params;
+
+typedef struct cudaGetSymbolAddress_v3020_params_st {
+    void **devPtr;
+    const void *symbol;
+} cudaGetSymbolAddress_v3020_params;
+
+typedef struct cudaGetSymbolSize_v3020_params_st {
+    size_t *size;
+    const void *symbol;
+} cudaGetSymbolSize_v3020_params;
+
+typedef struct cudaMemPrefetchAsync_ptsz_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_ptsz_v8000_params;
+
+typedef struct cudaMemPrefetchAsync_v2_ptsz_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    struct cudaMemLocation location;
+    unsigned int flags;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v2_ptsz_v12020_params;
+
+typedef struct cudaMemAdvise_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    enum cudaMemoryAdvise advice;
+    int device;
+} cudaMemAdvise_v8000_params;
+
+typedef struct cudaMemAdvise_v2_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    enum cudaMemoryAdvise advice;
+    struct cudaMemLocation location;
+} cudaMemAdvise_v2_v12020_params;
+
+typedef struct cudaMemRangeGetAttribute_v8000_params_st {
+    void *data;
+    size_t dataSize;
+    enum cudaMemRangeAttribute attribute;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttribute_v8000_params;
+
+typedef struct cudaMemRangeGetAttributes_v8000_params_st {
+    void **data;
+    size_t *dataSizes;
+    enum cudaMemRangeAttribute *attributes;
+    size_t numAttributes;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttributes_v8000_params;
+
+typedef struct cudaMemcpyToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromArray_ptds_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyArrayToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToArrayAsync_ptsz_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromArrayAsync_ptsz_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMallocAsync_ptsz_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_ptsz_v11020_params;
+
+typedef struct cudaFreeAsync_ptsz_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolTrimTo_v11020_params_st {
+    cudaMemPool_t memPool;
+    size_t minBytesToKeep;
+} cudaMemPoolTrimTo_v11020_params;
+
+typedef struct cudaMemPoolSetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolSetAttribute_v11020_params;
+
+typedef struct cudaMemPoolGetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolGetAttribute_v11020_params;
+
+typedef struct cudaMemPoolSetAccess_v11020_params_st {
+    cudaMemPool_t memPool;
+    const struct cudaMemAccessDesc *descList;
+    size_t count;
+} cudaMemPoolSetAccess_v11020_params;
+
+typedef struct cudaMemPoolGetAccess_v11020_params_st {
+    enum cudaMemAccessFlags *flags;
+    cudaMemPool_t memPool;
+    struct cudaMemLocation *location;
+} cudaMemPoolGetAccess_v11020_params;
+
+typedef struct cudaMemPoolCreate_v11020_params_st {
+    cudaMemPool_t *memPool;
+    const struct cudaMemPoolProps *poolProps;
+} cudaMemPoolCreate_v11020_params;
+
+typedef struct cudaMemPoolDestroy_v11020_params_st {
+    cudaMemPool_t memPool;
+} cudaMemPoolDestroy_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_ptsz_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolExportToShareableHandle_v11020_params_st {
+    void *shareableHandle;
+    cudaMemPool_t memPool;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolExportToShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolImportFromShareableHandle_v11020_params_st {
+    cudaMemPool_t *memPool;
+    void *shareableHandle;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolImportFromShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolExportPointer_v11020_params_st {
+    struct cudaMemPoolPtrExportData *exportData;
+    void *ptr;
+} cudaMemPoolExportPointer_v11020_params;
+
+typedef struct cudaMemPoolImportPointer_v11020_params_st {
+    void **ptr;
+    cudaMemPool_t memPool;
+    struct cudaMemPoolPtrExportData *exportData;
+} cudaMemPoolImportPointer_v11020_params;
+
+typedef struct cudaPointerGetAttributes_v4000_params_st {
+    struct cudaPointerAttributes *attributes;
+    const void *ptr;
+} cudaPointerGetAttributes_v4000_params;
+
+typedef struct cudaDeviceCanAccessPeer_v4000_params_st {
+    int *canAccessPeer;
+    int device;
+    int peerDevice;
+} cudaDeviceCanAccessPeer_v4000_params;
+
+typedef struct cudaDeviceEnablePeerAccess_v4000_params_st {
+    int peerDevice;
+    unsigned int flags;
+} cudaDeviceEnablePeerAccess_v4000_params;
+
+typedef struct cudaDeviceDisablePeerAccess_v4000_params_st {
+    int peerDevice;
+} cudaDeviceDisablePeerAccess_v4000_params;
+
+typedef struct cudaGraphicsUnregisterResource_v3020_params_st {
+    cudaGraphicsResource_t resource;
+} cudaGraphicsUnregisterResource_v3020_params;
+
+typedef struct cudaGraphicsResourceSetMapFlags_v3020_params_st {
+    cudaGraphicsResource_t resource;
+    unsigned int flags;
+} cudaGraphicsResourceSetMapFlags_v3020_params;
+
+typedef struct cudaGraphicsMapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsMapResources_v3020_params;
+
+typedef struct cudaGraphicsUnmapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsUnmapResources_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedPointer_v3020_params_st {
+    void **devPtr;
+    size_t *size;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedPointer_v3020_params;
+
+typedef struct cudaGraphicsSubResourceGetMappedArray_v3020_params_st {
+    cudaArray_t *array;
+    cudaGraphicsResource_t resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cudaGraphicsSubResourceGetMappedArray_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t *mipmappedArray;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedMipmappedArray_v5000_params;
+
+typedef struct cudaGetChannelDesc_v3020_params_st {
+    struct cudaChannelFormatDesc *desc;
+    cudaArray_const_t array;
+} cudaGetChannelDesc_v3020_params;
+
+typedef struct cudaCreateChannelDesc_v3020_params_st {
+    int x;
+    int y;
+    int z;
+    int w;
+    enum cudaChannelFormatKind f;
+} cudaCreateChannelDesc_v3020_params;
+
+typedef struct cudaCreateTextureObject_v5000_params_st {
+    cudaTextureObject_t *pTexObject;
+    const struct cudaResourceDesc *pResDesc;
+    const struct cudaTextureDesc *pTexDesc;
+    const struct cudaResourceViewDesc *pResViewDesc;
+} cudaCreateTextureObject_v5000_params;
+
+typedef struct cudaDestroyTextureObject_v5000_params_st {
+    cudaTextureObject_t texObject;
+} cudaDestroyTextureObject_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectTextureDesc_v5000_params_st {
+    struct cudaTextureDesc *pTexDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectTextureDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceViewDesc_v5000_params_st {
+    struct cudaResourceViewDesc *pResViewDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceViewDesc_v5000_params;
+
+typedef struct cudaCreateSurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t *pSurfObject;
+    const struct cudaResourceDesc *pResDesc;
+} cudaCreateSurfaceObject_v5000_params;
+
+typedef struct cudaDestroySurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t surfObject;
+} cudaDestroySurfaceObject_v5000_params;
+
+typedef struct cudaGetSurfaceObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaSurfaceObject_t surfObject;
+} cudaGetSurfaceObjectResourceDesc_v5000_params;
+
+typedef struct cudaDriverGetVersion_v3020_params_st {
+    int *driverVersion;
+} cudaDriverGetVersion_v3020_params;
+
+typedef struct cudaRuntimeGetVersion_v3020_params_st {
+    int *runtimeVersion;
+} cudaRuntimeGetVersion_v3020_params;
+
+typedef struct cudaGraphCreate_v10000_params_st {
+    cudaGraph_t *pGraph;
+    unsigned int flags;
+} cudaGraphCreate_v10000_params;
+
+typedef struct cudaGraphAddKernelNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphAddKernelNode_v10000_params;
+
+typedef struct cudaGraphKernelNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeGetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeSetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeCopyAttributes_v11000_params_st {
+    cudaGraphNode_t hSrc;
+    cudaGraphNode_t hDst;
+} cudaGraphKernelNodeCopyAttributes_v11000_params;
+
+typedef struct cudaGraphKernelNodeGetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    cudaKernelNodeAttrValue *value_out;
+} cudaGraphKernelNodeGetAttribute_v11000_params;
+
+typedef struct cudaGraphKernelNodeSetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    const cudaKernelNodeAttrValue *value;
+} cudaGraphKernelNodeSetAttribute_v11000_params;
+
+typedef struct cudaGraphAddMemcpyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemcpy3DParms *pCopyParams;
+} cudaGraphAddMemcpyNode_v10000_params;
+
+typedef struct cudaGraphAddMemcpyNodeToSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeToSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNodeFromSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeFromSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNode1D_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNode1D_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeSetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphAddMemsetNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemsetParams *pMemsetParams;
+} cudaGraphAddMemsetNode_v10000_params;
+
+typedef struct cudaGraphMemsetNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemsetNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddHostNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphAddHostNode_v10000_params;
+
+typedef struct cudaGraphHostNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeGetParams_v10000_params;
+
+typedef struct cudaGraphHostNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddChildGraphNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaGraph_t childGraph;
+} cudaGraphAddChildGraphNode_v10000_params;
+
+typedef struct cudaGraphChildGraphNodeGetGraph_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraph_t *pGraph;
+} cudaGraphChildGraphNodeGetGraph_v10000_params;
+
+typedef struct cudaGraphAddEmptyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+} cudaGraphAddEmptyNode_v10000_params;
+
+typedef struct cudaGraphAddEventRecordNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventRecordNode_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventRecordNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddEventWaitNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventWaitNode_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventWaitNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddExternalSemaphoresSignalNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresSignalNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreSignalNodeParams *params_out;
+} cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddExternalSemaphoresWaitNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresWaitNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreWaitNodeParams *params_out;
+} cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddMemAllocNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    struct cudaMemAllocNodeParams *nodeParams;
+} cudaGraphAddMemAllocNode_v11040_params;
+
+typedef struct cudaGraphMemAllocNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemAllocNodeParams *params_out;
+} cudaGraphMemAllocNodeGetParams_v11040_params;
+
+typedef struct cudaGraphAddMemFreeNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dptr;
+} cudaGraphAddMemFreeNode_v11040_params;
+
+typedef struct cudaGraphMemFreeNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    void *dptr_out;
+} cudaGraphMemFreeNodeGetParams_v11040_params;
+
+typedef struct cudaDeviceGraphMemTrim_v11040_params_st {
+    int device;
+} cudaDeviceGraphMemTrim_v11040_params;
+
+typedef struct cudaDeviceGetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceGetGraphMemAttribute_v11040_params;
+
+typedef struct cudaDeviceSetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceSetGraphMemAttribute_v11040_params;
+
+typedef struct cudaGraphClone_v10000_params_st {
+    cudaGraph_t *pGraphClone;
+    cudaGraph_t originalGraph;
+} cudaGraphClone_v10000_params;
+
+typedef struct cudaGraphNodeFindInClone_v10000_params_st {
+    cudaGraphNode_t *pNode;
+    cudaGraphNode_t originalNode;
+    cudaGraph_t clonedGraph;
+} cudaGraphNodeFindInClone_v10000_params;
+
+typedef struct cudaGraphNodeGetType_v10000_params_st {
+    cudaGraphNode_t node;
+    enum cudaGraphNodeType *pType;
+} cudaGraphNodeGetType_v10000_params;
+
+typedef struct cudaGraphGetNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *nodes;
+    size_t *numNodes;
+} cudaGraphGetNodes_v10000_params;
+
+typedef struct cudaGraphGetRootNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *pRootNodes;
+    size_t *pNumRootNodes;
+} cudaGraphGetRootNodes_v10000_params;
+
+typedef struct cudaGraphGetEdges_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *from;
+    cudaGraphNode_t *to;
+    size_t *numEdges;
+} cudaGraphGetEdges_v10000_params;
+
+typedef struct cudaGraphGetEdges_v2_v12030_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *from;
+    cudaGraphNode_t *to;
+    cudaGraphEdgeData *edgeData;
+    size_t *numEdges;
+} cudaGraphGetEdges_v2_v12030_params;
+
+typedef struct cudaGraphNodeGetDependencies_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependencies;
+    size_t *pNumDependencies;
+} cudaGraphNodeGetDependencies_v10000_params;
+
+typedef struct cudaGraphNodeGetDependencies_v2_v12030_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependencies;
+    cudaGraphEdgeData *edgeData;
+    size_t *pNumDependencies;
+} cudaGraphNodeGetDependencies_v2_v12030_params;
+
+typedef struct cudaGraphNodeGetDependentNodes_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependentNodes;
+    size_t *pNumDependentNodes;
+} cudaGraphNodeGetDependentNodes_v10000_params;
+
+typedef struct cudaGraphNodeGetDependentNodes_v2_v12030_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependentNodes;
+    cudaGraphEdgeData *edgeData;
+    size_t *pNumDependentNodes;
+} cudaGraphNodeGetDependentNodes_v2_v12030_params;
+
+typedef struct cudaGraphAddDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphAddDependencies_v10000_params;
+
+typedef struct cudaGraphAddDependencies_v2_v12030_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    const cudaGraphEdgeData *edgeData;
+    size_t numDependencies;
+} cudaGraphAddDependencies_v2_v12030_params;
+
+typedef struct cudaGraphRemoveDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphRemoveDependencies_v10000_params;
+
+typedef struct cudaGraphRemoveDependencies_v2_v12030_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    const cudaGraphEdgeData *edgeData;
+    size_t numDependencies;
+} cudaGraphRemoveDependencies_v2_v12030_params;
+
+typedef struct cudaGraphDestroyNode_v10000_params_st {
+    cudaGraphNode_t node;
+} cudaGraphDestroyNode_v10000_params;
+
+typedef struct cudaGraphInstantiate_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    unsigned long long flags;
+} cudaGraphInstantiate_v12000_params;
+
+typedef struct cudaGraphInstantiateWithFlags_v11040_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    unsigned long long flags;
+} cudaGraphInstantiateWithFlags_v11040_params;
+
+typedef struct cudaGraphInstantiateWithParams_ptsz_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphInstantiateParams *instantiateParams;
+} cudaGraphInstantiateWithParams_ptsz_v12000_params;
+
+typedef struct cudaGraphExecGetFlags_v12000_params_st {
+    cudaGraphExec_t graphExec;
+    unsigned long long *flags;
+} cudaGraphExecGetFlags_v12000_params;
+
+typedef struct cudaGraphExecKernelNodeSetParams_v10010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphExecKernelNodeSetParams_v10010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphExecMemcpyNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphExecMemsetNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphExecMemsetNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecHostNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphExecHostNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecChildGraphNodeSetParams_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    cudaGraph_t childGraph;
+} cudaGraphExecChildGraphNodeSetParams_v11010_params;
+
+typedef struct cudaGraphExecEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphNodeSetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int isEnabled;
+} cudaGraphNodeSetEnabled_v11060_params;
+
+typedef struct cudaGraphNodeGetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int *isEnabled;
+} cudaGraphNodeGetEnabled_v11060_params;
+
+typedef struct cudaGraphExecUpdate_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraph_t hGraph;
+    cudaGraphExecUpdateResultInfo *resultInfo;
+} cudaGraphExecUpdate_v10020_params;
+
+typedef struct cudaGraphUpload_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_ptsz_v10000_params;
+
+typedef struct cudaGraphLaunch_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_ptsz_v10000_params;
+
+typedef struct cudaGraphExecDestroy_v10000_params_st {
+    cudaGraphExec_t graphExec;
+} cudaGraphExecDestroy_v10000_params;
+
+typedef struct cudaGraphDestroy_v10000_params_st {
+    cudaGraph_t graph;
+} cudaGraphDestroy_v10000_params;
+
+typedef struct cudaGraphDebugDotPrint_v11030_params_st {
+    cudaGraph_t graph;
+    const char *path;
+    unsigned int flags;
+} cudaGraphDebugDotPrint_v11030_params;
+
+typedef struct cudaUserObjectCreate_v11030_params_st {
+    cudaUserObject_t *object_out;
+    void *ptr;
+    cudaHostFn_t destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cudaUserObjectCreate_v11030_params;
+
+typedef struct cudaUserObjectRetain_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRetain_v11030_params;
+
+typedef struct cudaUserObjectRelease_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRelease_v11030_params;
+
+typedef struct cudaGraphRetainUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+    unsigned int flags;
+} cudaGraphRetainUserObject_v11030_params;
+
+typedef struct cudaGraphReleaseUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaGraphReleaseUserObject_v11030_params;
+
+typedef struct cudaGraphAddNode_v12020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphAddNode_v12020_params;
+
+typedef struct cudaGraphAddNode_v2_v12030_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphAddNode_v2_v12030_params;
+
+typedef struct cudaGraphNodeSetParams_v12020_params_st {
+    cudaGraphNode_t node;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphNodeSetParams_v12020_params;
+
+typedef struct cudaGraphExecNodeSetParams_v12020_params_st {
+    cudaGraphExec_t graphExec;
+    cudaGraphNode_t node;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphExecNodeSetParams_v12020_params;
+
+typedef struct cudaGraphConditionalHandleCreate_v12030_params_st {
+    cudaGraphConditionalHandle *pHandle_out;
+    cudaGraph_t graph;
+    unsigned int defaultLaunchValue;
+    unsigned int flags;
+} cudaGraphConditionalHandleCreate_v12030_params;
+
+typedef struct cudaGetDriverEntryPoint_ptsz_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPoint_ptsz_v11030_params;
+
+typedef struct cudaGetDriverEntryPointByVersion_ptsz_v12050_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned int cudaVersion;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPointByVersion_ptsz_v12050_params;
+
+typedef struct cudaGetFuncBySymbol_v11000_params_st {
+    cudaFunction_t *functionPtr;
+    const void *symbolPtr;
+} cudaGetFuncBySymbol_v11000_params;
+
+typedef struct cudaGetKernel_v12000_params_st {
+    cudaKernel_t *kernelPtr;
+    const void *entryFuncAddr;
+} cudaGetKernel_v12000_params;
+
+typedef struct cudaMemcpy_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_v3020_params;
+
+typedef struct cudaMemcpyToSymbol_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_v3020_params;
+
+typedef struct cudaMemcpyFromSymbol_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_v3020_params;
+
+typedef struct cudaMemcpy2D_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_v3020_params;
+
+typedef struct cudaMemcpyToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_v3020_params;
+
+typedef struct cudaMemcpy2DToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_v3020_params;
+
+typedef struct cudaMemcpyFromArray_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_v3020_params;
+
+typedef struct cudaMemcpy2DFromArray_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_v3020_params;
+
+typedef struct cudaMemcpyArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy2DArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy3D_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_v3020_params;
+
+typedef struct cudaMemcpy3DPeer_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_v4000_params;
+
+typedef struct cudaMemcpyBatchAsync_v12080_params_st {
+    void **dsts;
+    void **srcs;
+    size_t *sizes;
+    size_t count;
+    struct cudaMemcpyAttributes *attrs;
+    size_t *attrsIdxs;
+    size_t numAttrs;
+    size_t *failIdx;
+    cudaStream_t stream;
+} cudaMemcpyBatchAsync_v12080_params;
+
+typedef struct cudaMemcpy3DBatchAsync_v12080_params_st {
+    size_t numOps;
+    struct cudaMemcpy3DBatchOp *opList;
+    size_t *failIdx;
+    unsigned long long flags;
+    cudaStream_t stream;
+} cudaMemcpy3DBatchAsync_v12080_params;
+
+typedef struct cudaMemset_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+} cudaMemset_v3020_params;
+
+typedef struct cudaMemset2D_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_v3020_params;
+
+typedef struct cudaMemset3D_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+} cudaMemset3D_v3020_params;
+
+typedef struct cudaMemcpyAsync_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyAsync_v3020_params;
+
+typedef struct cudaMemcpyToSymbolAsync_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpy2DAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DAsync_v3020_params;
+
+typedef struct cudaMemcpyToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpyFromArrayAsync_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy3DAsync_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DAsync_v3020_params;
+
+typedef struct cudaMemcpy3DPeerAsync_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DPeerAsync_v4000_params;
+
+typedef struct cudaMemsetAsync_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemsetAsync_v3020_params;
+
+typedef struct cudaMemset2DAsync_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;
+} cudaMemset2DAsync_v3020_params;
+
+typedef struct cudaMemset3DAsync_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+    cudaStream_t stream;
+} cudaMemset3DAsync_v3020_params;
+
+typedef struct cudaStreamQuery_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamQuery_v3020_params;
+
+typedef struct cudaStreamGetDevice_v12080_params_st {
+    cudaStream_t hStream;
+    int *device;
+} cudaStreamGetDevice_v12080_params;
+
+typedef struct cudaStreamGetFlags_v5050_params_st {
+    cudaStream_t hStream;
+    unsigned int *flags;
+} cudaStreamGetFlags_v5050_params;
+
+typedef struct cudaStreamGetId_v12000_params_st {
+    cudaStream_t hStream;
+    unsigned long long *streamId;
+} cudaStreamGetId_v12000_params;
+
+typedef struct cudaStreamGetPriority_v5050_params_st {
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_v5050_params;
+
+typedef struct cudaEventRecord_v3020_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_v3020_params;
+
+typedef struct cudaEventRecordWithFlags_v11010_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;
+} cudaEventRecordWithFlags_v11010_params;
+
+typedef struct cudaStreamWaitEvent_v3020_params_st {
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_v3020_params;
+
+typedef struct cudaStreamAddCallback_v5000_params_st {
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;
+    unsigned int flags;
+} cudaStreamAddCallback_v5000_params;
+
+typedef struct cudaStreamAttachMemAsync_v6000_params_st {
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_v6000_params;
+
+typedef struct cudaStreamSynchronize_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamSynchronize_v3020_params;
+
+typedef struct cudaLaunchKernel_v7000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_v7000_params;
+
+typedef struct cudaLaunchKernelExC_v11060_params_st {
+    const cudaLaunchConfig_t *config;
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_v9000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_v9000_params;
+
+typedef struct cudaLaunchHostFunc_v10000_params_st {
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;
+} cudaLaunchHostFunc_v10000_params;
+
+typedef struct cudaMemPrefetchAsync_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v8000_params;
+
+typedef struct cudaMemPrefetchAsync_v2_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    struct cudaMemLocation location;
+    unsigned int flags;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v2_v12020_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaGraphInstantiateWithParams_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphInstantiateParams *instantiateParams;
+} cudaGraphInstantiateWithParams_v12000_params;
+
+typedef struct cudaGraphUpload_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_v10000_params;
+
+typedef struct cudaGraphLaunch_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_v10000_params;
+
+typedef struct cudaStreamBeginCapture_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCapture_v10000_params;
+
+typedef struct cudaStreamBeginCaptureToGraph_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCaptureToGraph_v12030_params;
+
+typedef struct cudaStreamEndCapture_v10000_params_st {
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;
+} cudaStreamEndCapture_v10000_params;
+
+typedef struct cudaStreamIsCapturing_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+} cudaStreamIsCapturing_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+} cudaStreamGetCaptureInfo_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_ptsz_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+} cudaStreamGetCaptureInfo_ptsz_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_v11030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_v11030_params;
+
+typedef struct cudaStreamGetCaptureInfo_v3_v12030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    const cudaGraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v3_v12030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v11030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v2_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v2_v12030_params;
+
+typedef struct cudaStreamCopyAttributes_v11000_params_st {
+    cudaStream_t dstStream;
+    cudaStream_t srcStream;
+} cudaStreamCopyAttributes_v11000_params;
+
+typedef struct cudaStreamGetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value;
+} cudaStreamGetAttribute_v11000_params;
+
+typedef struct cudaStreamSetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *param;
+} cudaStreamSetAttribute_v11000_params;
+
+typedef struct cudaMallocAsync_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_v11020_params;
+
+typedef struct cudaFreeAsync_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_v11020_params;
+
+typedef struct cudaGetDriverEntryPoint_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPoint_v11030_params;
+
+typedef struct cudaGetDriverEntryPointByVersion_v12050_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned int cudaVersion;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPointByVersion_v12050_params;
+
+typedef struct cudaGetDeviceProperties_v3020_params_st {
+    struct cudaDeviceProp *prop;
+    int device;
+} cudaGetDeviceProperties_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_vdpau_interop_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_vdpau_interop_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..88e79d1957925c4bbacd381e9461d5072de88f24
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cuda_vdpau_interop_meta.h
@@ -0,0 +1,38 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_vdpau_interop.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaVDPAUGetDevice_v3020_params_st {  // holds the parameters of cudaVDPAUGetDevice
+    int *device;  // pointer here, whereas the cudaVDPAUSetVDPAUDevice record stores the device by value
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUGetDevice_v3020_params;
+
+typedef struct cudaVDPAUSetVDPAUDevice_v3020_params_st {  // holds the parameters of cudaVDPAUSetVDPAUDevice
+    int device;  // stored by value
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUSetVDPAUDevice_v3020_params;
+
+typedef struct cudaGraphicsVDPAURegisterVideoSurface_v3020_params_st {  // holds the parameters of cudaGraphicsVDPAURegisterVideoSurface
+    struct cudaGraphicsResource **resource;
+    VdpVideoSurface vdpSurface;  // video-surface handle type; the OutputSurface record uses VdpOutputSurface instead
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterVideoSurface_v3020_params;
+
+typedef struct cudaGraphicsVDPAURegisterOutputSurface_v3020_params_st {  // holds the parameters of cudaGraphicsVDPAURegisterOutputSurface
+    struct cudaGraphicsResource **resource;
+    VdpOutputSurface vdpSurface;  // output-surface handle type
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterOutputSurface_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudart_removed_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudart_removed_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0fc27a71bb3fc883db9fe7562eea3f28145430d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudart_removed_meta.h
@@ -0,0 +1,162 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cudart_removed.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaStreamDestroy_v3020_params_st {  // holds the single parameter of cudaStreamDestroy
+    cudaStream_t stream;
+} cudaStreamDestroy_v3020_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params_st {  // holds the parameters of cudaOccupancyMaxActiveBlocksPerMultiprocessor
+    int *numBlocks;
+    const void *func;
+    size_t numDynamicSmemBytes;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params;
+
+typedef struct cudaConfigureCall_v3020_params_st {  // holds the parameters of cudaConfigureCall
+    dim3 gridDim;
+    dim3 blockDim;
+    size_t sharedMem  __dv;  // NOTE(review): bare __dv token after the field name; presumably a macro from an included header that expands to nothing - confirm
+    cudaStream_t stream  __dv;  // NOTE(review): same bare __dv token as above
+} cudaConfigureCall_v3020_params;
+
+typedef struct cudaSetupArgument_v3020_params_st {  // holds the parameters of cudaSetupArgument
+    const void *arg;
+    size_t size;
+    size_t offset;
+} cudaSetupArgument_v3020_params;
+
+typedef struct cudaLaunch_v3020_params_st {  // holds the single parameter of cudaLaunch
+    const void *func;
+} cudaLaunch_v3020_params;
+
+typedef struct cudaLaunch_ptsz_v7000_params_st {  // same single field as the cudaLaunch record, for the _ptsz entry point
+    const void *func;
+} cudaLaunch_ptsz_v7000_params;
+
+typedef struct cudaStreamSetFlags_v10200_params_st {  // holds the parameters of cudaStreamSetFlags
+    cudaStream_t hStream;
+    unsigned int flags;
+} cudaStreamSetFlags_v10200_params;
+
+typedef struct cudaStreamSetFlags_ptsz_v10200_params_st {  // field-for-field identical to the record above, for the _ptsz entry point
+    cudaStream_t hStream;
+    unsigned int flags;
+} cudaStreamSetFlags_ptsz_v10200_params;
+
+typedef struct cudaProfilerInitialize_v4000_params_st {  // holds the parameters of cudaProfilerInitialize
+    const char *configFile;
+    const char *outputFile;
+    cudaOutputMode_t outputMode;
+} cudaProfilerInitialize_v4000_params;
+
+typedef struct cudaThreadSetLimit_v3020_params_st {  // holds the parameters of cudaThreadSetLimit
+    enum cudaLimit limit;
+    size_t value;  // by value; the Get record below carries a size_t pointer instead
+} cudaThreadSetLimit_v3020_params;
+
+typedef struct cudaThreadGetLimit_v3020_params_st {  // holds the parameters of cudaThreadGetLimit
+    size_t *pValue;
+    enum cudaLimit limit;
+} cudaThreadGetLimit_v3020_params;
+
+typedef struct cudaThreadGetCacheConfig_v3020_params_st {  // holds the single parameter of cudaThreadGetCacheConfig
+    enum cudaFuncCache *pCacheConfig;
+} cudaThreadGetCacheConfig_v3020_params;
+
+typedef struct cudaThreadSetCacheConfig_v3020_params_st {  // holds the single parameter of cudaThreadSetCacheConfig
+    enum cudaFuncCache cacheConfig;
+} cudaThreadSetCacheConfig_v3020_params;
+
+typedef struct cudaSetDoubleForDevice_v3020_params_st {  // holds the single parameter of cudaSetDoubleForDevice
+    double *d;
+} cudaSetDoubleForDevice_v3020_params;
+
+typedef struct cudaSetDoubleForHost_v3020_params_st {  // holds the single parameter of cudaSetDoubleForHost
+    double *d;
+} cudaSetDoubleForHost_v3020_params;
+
+typedef struct cudaCreateTextureObject_v2_v11080_params_st {  // holds the parameters of cudaCreateTextureObject_v2
+    cudaTextureObject_t *pTexObject;
+    const struct cudaResourceDesc *pResDesc;
+    const struct cudaTextureDesc *pTexDesc;
+    const struct cudaResourceViewDesc *pResViewDesc;
+} cudaCreateTextureObject_v2_v11080_params;
+
+typedef struct cudaGetTextureObjectTextureDesc_v2_v11080_params_st {  // holds the parameters of cudaGetTextureObjectTextureDesc_v2
+    struct cudaTextureDesc *pTexDesc;  // non-const here, unlike the descriptor pointers in the Create record
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectTextureDesc_v2_v11080_params;
+
+typedef struct cudaBindTexture_v3020_params_st {  // holds the parameters of cudaBindTexture
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t size  __dv;  // NOTE(review): bare __dv token; presumably a macro from an included header that expands to nothing - confirm
+} cudaBindTexture_v3020_params;
+
+typedef struct cudaBindTexture2D_v3020_params_st {  // holds the parameters of cudaBindTexture2D
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    size_t pitch;
+} cudaBindTexture2D_v3020_params;
+
+typedef struct cudaBindTextureToArray_v3020_params_st {  // holds the parameters of cudaBindTextureToArray
+    const struct textureReference *texref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToArray_v3020_params;
+
+typedef struct cudaBindTextureToMipmappedArray_v5000_params_st {  // holds the parameters of cudaBindTextureToMipmappedArray
+    const struct textureReference *texref;
+    cudaMipmappedArray_const_t mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToMipmappedArray_v5000_params;
+
+typedef struct cudaUnbindTexture_v3020_params_st {  // holds the single parameter of cudaUnbindTexture
+    const struct textureReference *texref;
+} cudaUnbindTexture_v3020_params;
+
+typedef struct cudaGetTextureAlignmentOffset_v3020_params_st {  // holds the parameters of cudaGetTextureAlignmentOffset
+    size_t *offset;
+    const struct textureReference *texref;
+} cudaGetTextureAlignmentOffset_v3020_params;
+
+typedef struct cudaGetTextureReference_v3020_params_st {  // holds the parameters of cudaGetTextureReference
+    const struct textureReference **texref;  // double pointer, unlike the single texref pointer in the Bind* records
+    const void *symbol;
+} cudaGetTextureReference_v3020_params;
+
+typedef struct cudaBindSurfaceToArray_v3020_params_st {  // holds the parameters of cudaBindSurfaceToArray
+    const struct surfaceReference *surfref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindSurfaceToArray_v3020_params;
+
+typedef struct cudaGetSurfaceReference_v3020_params_st {  // holds the parameters of cudaGetSurfaceReference
+    const struct surfaceReference **surfref;  // double pointer, unlike the single surfref pointer above
+    const void *symbol;
+} cudaGetSurfaceReference_v3020_params;
+
+typedef struct cudaGraphInstantiate_v10000_params_st {  // holds the parameters of cudaGraphInstantiate
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphNode_t *pErrorNode;
+    char *pLogBuffer;  // non-const char buffer; presumably paired with bufferSize below - confirm
+    size_t bufferSize;
+} cudaGraphInstantiate_v10000_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_nvtx_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_nvtx_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed8877e21f0651fe1564151090850694eb495cfb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_nvtx_meta.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2013-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct nvtxMarkEx_params_st {  /* holds the parameter of nvtxMarkEx; also embedded as "core" in nvtxDomainMarkEx_params */
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxMarkEx_params;
+
+typedef struct nvtxMarkA_params_st {  /* holds the parameter of nvtxMarkA (char message) */
+  const char* message;
+} nvtxMarkA_params;
+
+typedef struct nvtxMarkW_params_st {  /* holds the parameter of nvtxMarkW (wchar_t message) */
+  const wchar_t* message;
+} nvtxMarkW_params;
+
+typedef struct nvtxRangeStartEx_params_st {  /* holds the parameter of nvtxRangeStartEx; also embedded as "core" in nvtxDomainRangeStartEx_params */
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxRangeStartEx_params;
+
+typedef struct nvtxRangeStartA_params_st {  /* holds the parameter of nvtxRangeStartA (char message) */
+  const char* message;
+} nvtxRangeStartA_params;
+
+typedef struct nvtxRangeStartW_params_st {  /* holds the parameter of nvtxRangeStartW (wchar_t message) */
+  const wchar_t* message;
+} nvtxRangeStartW_params;
+
+typedef struct nvtxRangeEnd_params_st {  /* holds the parameter of nvtxRangeEnd; also embedded as "core" in nvtxDomainRangeEnd_params */
+  nvtxRangeId_t id;
+} nvtxRangeEnd_params;
+
+typedef struct nvtxRangePushEx_params_st {  /* holds the parameter of nvtxRangePushEx; also embedded as "core" in nvtxDomainRangePushEx_params */
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxRangePushEx_params;
+
+typedef struct nvtxRangePushA_params_st {  /* holds the parameter of nvtxRangePushA (char message) */
+  const char* message;
+} nvtxRangePushA_params;
+
+typedef struct nvtxRangePushW_params_st {  /* holds the parameter of nvtxRangePushW (wchar_t message) */
+  const wchar_t* message;
+} nvtxRangePushW_params;
+
+typedef struct nvtxRangePop_params_st {  /* record for nvtxRangePop; carries no real payload */
+  /* Workaround (WAR): the Windows compiler does not allow empty structs, so a placeholder member is declared. */
+  /* This field must not be used. */
+  void *dummy;
+} nvtxRangePop_params;
+
+typedef struct nvtxNameCategoryA_params_st {  /* holds the parameters of nvtxNameCategoryA (char name) */
+  uint32_t category;
+  const char* name;
+} nvtxNameCategoryA_params;
+
+typedef struct nvtxNameCategoryW_params_st {  /* holds the parameters of nvtxNameCategoryW (wchar_t name) */
+  uint32_t category;
+  const wchar_t* name;
+} nvtxNameCategoryW_params;
+
+typedef struct nvtxNameOsThreadA_params_st {  /* holds the parameters of nvtxNameOsThreadA (char name) */
+  uint32_t threadId;
+  const char* name;
+} nvtxNameOsThreadA_params;
+
+typedef struct nvtxNameOsThreadW_params_st {  /* holds the parameters of nvtxNameOsThreadW (wchar_t name) */
+  uint32_t threadId;
+  const wchar_t* name;
+} nvtxNameOsThreadW_params;
+
+typedef struct nvtxNameCuDeviceA_params_st {  /* holds the parameters of nvtxNameCuDeviceA: a CUdevice plus a char name */
+  CUdevice device;
+  const char* name;
+} nvtxNameCuDeviceA_params;
+
+typedef struct nvtxNameCuDeviceW_params_st {  /* wchar_t-name counterpart of the record above */
+  CUdevice device;
+  const wchar_t* name;
+} nvtxNameCuDeviceW_params;
+
+typedef struct nvtxNameCuContextA_params_st {  /* holds the parameters of nvtxNameCuContextA: a CUcontext plus a char name */
+  CUcontext context;
+  const char* name;
+} nvtxNameCuContextA_params;
+
+typedef struct nvtxNameCuContextW_params_st {  /* wchar_t-name counterpart of the record above */
+  CUcontext context;
+  const wchar_t* name;
+} nvtxNameCuContextW_params;
+
+typedef struct nvtxNameCuStreamA_params_st {  /* holds the parameters of nvtxNameCuStreamA: a CUstream plus a char name */
+  CUstream stream;
+  const char* name;
+} nvtxNameCuStreamA_params;
+
+typedef struct nvtxNameCuStreamW_params_st {  /* wchar_t-name counterpart of the record above */
+  CUstream stream;
+  const wchar_t* name;
+} nvtxNameCuStreamW_params;
+
+typedef struct nvtxNameCuEventA_params_st {  /* holds the parameters of nvtxNameCuEventA: a CUevent plus a char name */
+  CUevent event;
+  const char* name;
+} nvtxNameCuEventA_params;
+
+typedef struct nvtxNameCuEventW_params_st {  /* wchar_t-name counterpart of the record above */
+  CUevent event;
+  const wchar_t* name;
+} nvtxNameCuEventW_params;
+
+typedef struct nvtxNameCudaDeviceA_params_st {  /* holds the parameters of nvtxNameCudaDeviceA: an int device plus a char name */
+  int device;
+  const char* name;
+} nvtxNameCudaDeviceA_params;
+
+typedef struct nvtxNameCudaDeviceW_params_st {  /* wchar_t-name counterpart of the record above */
+  int device;
+  const wchar_t* name;
+} nvtxNameCudaDeviceW_params;
+
+typedef struct nvtxNameCudaStreamA_params_st {  /* holds the parameters of nvtxNameCudaStreamA: a cudaStream_t plus a char name */
+  cudaStream_t stream;
+  const char* name;
+} nvtxNameCudaStreamA_params;
+
+typedef struct nvtxNameCudaStreamW_params_st {  /* wchar_t-name counterpart of the record above */
+  cudaStream_t stream;
+  const wchar_t* name;
+} nvtxNameCudaStreamW_params;
+
+typedef struct nvtxNameCudaEventA_params_st {  /* holds the parameters of nvtxNameCudaEventA: a cudaEvent_t plus a char name */
+  cudaEvent_t event;
+  const char* name;
+} nvtxNameCudaEventA_params;
+
+typedef struct nvtxNameCudaEventW_params_st {  /* wchar_t-name counterpart of the record above */
+  cudaEvent_t event;
+  const wchar_t* name;
+} nvtxNameCudaEventW_params;
+
+typedef struct nvtxDomainCreateA_params_st {  /* holds the parameter of nvtxDomainCreateA (char name); no W counterpart is declared in this header */
+  const char* name;
+} nvtxDomainCreateA_params;
+
+typedef struct nvtxDomainDestroy_params_st {  /* holds the parameter of nvtxDomainDestroy */
+  nvtxDomainHandle_t domain;
+} nvtxDomainDestroy_params;
+
+typedef struct nvtxDomainMarkEx_params_st {  /* domain handle plus the embedded nvtxMarkEx record */
+  nvtxDomainHandle_t domain;
+  nvtxMarkEx_params core;  /* stored by value, not by pointer */
+} nvtxDomainMarkEx_params;
+
+typedef struct nvtxDomainRangeStartEx_params_st {  /* domain handle plus the embedded nvtxRangeStartEx record */
+  nvtxDomainHandle_t domain;
+  nvtxRangeStartEx_params core;
+} nvtxDomainRangeStartEx_params;
+
+typedef struct nvtxDomainRangeEnd_params_st {  /* domain handle plus the embedded nvtxRangeEnd record */
+  nvtxDomainHandle_t domain;
+  nvtxRangeEnd_params core;
+} nvtxDomainRangeEnd_params;
+
+typedef struct nvtxDomainRangePushEx_params_st {  /* domain handle plus the embedded nvtxRangePushEx record */
+  nvtxDomainHandle_t domain;
+  nvtxRangePushEx_params core;
+} nvtxDomainRangePushEx_params;
+
+typedef struct nvtxDomainRangePop_params_st {  /* domain handle only; nvtxRangePop_params is not embedded here */
+  nvtxDomainHandle_t domain;
+} nvtxDomainRangePop_params;
+
+typedef struct nvtxSyncUserCreate_params_st {  /* holds the parameters of nvtxSyncUserCreate */
+  nvtxDomainHandle_t domain;
+  const nvtxSyncUserAttributes_t* attribs;
+} nvtxSyncUserCreate_params;
+
+typedef struct nvtxSyncUserCommon_params_st {  /* NOTE(review): "Common" suggests this is shared by several nvtxSyncUser* calls taking only a handle - confirm */
+  nvtxSyncUser_t handle;
+} nvtxSyncUserCommon_params;
+
+typedef struct nvtxDomainRegisterStringA_params_st {  /* holds the parameters of nvtxDomainRegisterStringA (char string) */
+    nvtxDomainHandle_t domain;
+    const char* string;
+} nvtxDomainRegisterStringA_params;
+
+typedef struct nvtxDomainRegisterStringW_params_st {  /* holds the parameters of nvtxDomainRegisterStringW (wide string) */
+    nvtxDomainHandle_t domain;
+    const wchar_t* string;  /* was const char*; every other *W record in this header uses const wchar_t*. Same pointer size, so layout is unchanged. */
+} nvtxDomainRegisterStringW_params;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/host_config.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/host_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..785bec4e5c0652f9605ccf9341b7f761a85471ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/host_config.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard macro not set: take the direct-inclusion path and warn */
+#if defined(_MSC_VER)
+#pragma message("host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else /* other compilers: same text via #warning */
+#warning "host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* set the guard macro before the crt/ include below */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__ /* marker: this wrapper set the guard and must clear it afterwards */
+#endif
+
+#include "crt/host_config.h" /* the actual content lives in the crt/ header */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__) /* clean up only what this wrapper defined */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/host_defines.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/host_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..98a9c98a957e8f60e872b94fde762516c5523367
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/host_defines.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard macro not set: take the direct-inclusion path and warn */
+#if defined(_MSC_VER)
+#pragma message("host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else /* other compilers: same text via #warning */
+#warning "host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* set the guard macro before the crt/ include below */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__ /* marker: this wrapper set the guard and must clear it afterwards */
+#endif
+
+#include "crt/host_defines.h" /* the actual content lives in the crt/ header */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__) /* clean up only what this wrapper defined */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
+#endif
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/math_constants.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/math_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..39937e980f88a614d847154f9e4364bd9ba95cbd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/math_constants.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__MATH_CONSTANTS_H__)
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants: literals carry the F suffix; bit-pattern values are built with __int_as_float */
+#define CUDART_INF_F            __int_as_float(0x7f800000U) /* exponent bits all ones, mantissa zero */
+#define CUDART_NAN_F            __int_as_float(0x7fffffffU) /* exponent and mantissa bits all ones */
+#define CUDART_MIN_DENORM_F     __int_as_float(0x00000001U) /* only the lowest mantissa bit set */
+#define CUDART_MAX_NORMAL_F     __int_as_float(0x7f7fffffU)
+#define CUDART_NEG_ZERO_F       __int_as_float(0x80000000U) /* only the sign bit set */
+#define CUDART_ZERO_F           0.0F
+#define CUDART_ONE_F            1.0F
+#define CUDART_SQRT_HALF_F      0.707106781F
+#define CUDART_SQRT_HALF_HI_F   0.707106781F /* NOTE(review): _HI/_LO look like a head/tail split of one constant - confirm intended usage */
+#define CUDART_SQRT_HALF_LO_F   1.210161749e-08F
+#define CUDART_SQRT_TWO_F       1.414213562F
+#define CUDART_THIRD_F          0.333333333F
+#define CUDART_PIO4_F           0.785398163F
+#define CUDART_PIO2_F           1.570796327F
+#define CUDART_3PIO4_F          2.356194490F
+#define CUDART_2_OVER_PI_F      0.636619772F
+#define CUDART_SQRT_2_OVER_PI_F 0.797884561F
+#define CUDART_PI_F             3.141592654F
+#define CUDART_L2E_F            1.442695041F
+#define CUDART_L2T_F            3.321928094F
+#define CUDART_LG2_F            0.301029996F
+#define CUDART_LGE_F            0.434294482F
+#define CUDART_LN2_F            0.693147181F
+#define CUDART_LNT_F            2.302585093F
+#define CUDART_LNPI_F           1.144729886F
+#define CUDART_TWO_TO_M126_F    1.175494351e-38F
+#define CUDART_TWO_TO_126_F     8.507059173e37F
+#define CUDART_NORM_HUGE_F      3.402823466e38F
+#define CUDART_TWO_TO_23_F      8388608.0F
+#define CUDART_TWO_TO_24_F      16777216.0F
+#define CUDART_TWO_TO_31_F      2147483648.0F
+#define CUDART_TWO_TO_32_F      4294967296.0F
+#define CUDART_REMQUO_BITS_F    3U
+#define CUDART_REMQUO_MASK_F    (~((~0U)<<CUDART_REMQUO_BITS_F)) /* low CUDART_REMQUO_BITS_F bits set, i.e. 0x7 */
+#define CUDART_TRIG_PLOSS_F     105615.0F
+
+/* double precision constants: unsuffixed literals; bit-pattern values are built with __longlong_as_double */
+#define CUDART_INF              __longlong_as_double(0x7ff0000000000000ULL) /* exponent bits all ones, mantissa zero */
+#define CUDART_NAN              __longlong_as_double(0xfff8000000000000ULL) /* sign bit set, exponent all ones, top mantissa bit set */
+#define CUDART_NEG_ZERO         __longlong_as_double(0x8000000000000000ULL) /* only the sign bit set */
+#define CUDART_MIN_DENORM       __longlong_as_double(0x0000000000000001ULL) /* only the lowest mantissa bit set */
+#define CUDART_ZERO             0.0
+#define CUDART_ONE              1.0
+#define CUDART_SQRT_TWO         1.4142135623730951e+0
+#define CUDART_SQRT_HALF        7.0710678118654757e-1
+#define CUDART_SQRT_HALF_HI     7.0710678118654757e-1 /* NOTE(review): _HI repeats the plain constant and _LO is a tiny correction term; looks like a head/tail split - confirm */
+#define CUDART_SQRT_HALF_LO   (-4.8336466567264567e-17)
+#define CUDART_THIRD            3.3333333333333333e-1
+#define CUDART_TWOTHIRD         6.6666666666666667e-1
+#define CUDART_PIO4             7.8539816339744828e-1
+#define CUDART_PIO4_HI          7.8539816339744828e-1
+#define CUDART_PIO4_LO          3.0616169978683830e-17
+#define CUDART_PIO2             1.5707963267948966e+0
+#define CUDART_PIO2_HI          1.5707963267948966e+0
+#define CUDART_PIO2_LO          6.1232339957367660e-17
+#define CUDART_3PIO4            2.3561944901923448e+0
+#define CUDART_2_OVER_PI        6.3661977236758138e-1
+#define CUDART_PI               3.1415926535897931e+0
+#define CUDART_PI_HI            3.1415926535897931e+0
+#define CUDART_PI_LO            1.2246467991473532e-16
+#define CUDART_SQRT_2PI         2.5066282746310007e+0
+#define CUDART_SQRT_2PI_HI      2.5066282746310007e+0
+#define CUDART_SQRT_2PI_LO    (-1.8328579980459167e-16)
+#define CUDART_SQRT_PIO2        1.2533141373155003e+0
+#define CUDART_SQRT_PIO2_HI     1.2533141373155003e+0
+#define CUDART_SQRT_PIO2_LO   (-9.1642899902295834e-17)
+#define CUDART_SQRT_2OPI        7.9788456080286536e-1
+#define CUDART_L2E              1.4426950408889634e+0
+#define CUDART_L2E_HI           1.4426950408889634e+0
+#define CUDART_L2E_LO           2.0355273740931033e-17
+#define CUDART_L2T              3.3219280948873622e+0
+#define CUDART_LG2              3.0102999566398120e-1
+#define CUDART_LG2_HI           3.0102999566398120e-1
+#define CUDART_LG2_LO         (-2.8037281277851704e-18)
+#define CUDART_LGE              4.3429448190325182e-1
+#define CUDART_LGE_HI           4.3429448190325182e-1
+#define CUDART_LGE_LO           1.09831965021676510e-17
+#define CUDART_LN2              6.9314718055994529e-1
+#define CUDART_LN2_HI           6.9314718055994529e-1
+#define CUDART_LN2_LO           2.3190468138462996e-17
+#define CUDART_LNT              2.3025850929940459e+0
+#define CUDART_LNT_HI           2.3025850929940459e+0
+#define CUDART_LNT_LO         (-2.1707562233822494e-16)
+#define CUDART_LNPI             1.1447298858494002e+0
+#define CUDART_LN2_X_1024       7.0978271289338397e+2
+#define CUDART_LN2_X_1025       7.1047586007394398e+2
+#define CUDART_LN2_X_1075       7.4513321910194122e+2
+#define CUDART_LG2_X_1024       3.0825471555991675e+2
+#define CUDART_LG2_X_1075       3.2360724533877976e+2
+#define CUDART_TWO_TO_23        8388608.0
+#define CUDART_TWO_TO_52        4503599627370496.0
+#define CUDART_TWO_TO_53        9007199254740992.0
+#define CUDART_TWO_TO_54        18014398509481984.0
+#define CUDART_TWO_TO_M54       5.5511151231257827e-17
+#define CUDART_TWO_TO_M1022     2.22507385850720140e-308
+#define CUDART_TRIG_PLOSS       2147483648.0
+#define CUDART_DBL2INT_CVT      6755399441055744.0
+
+#endif /* !__MATH_CONSTANTS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/math_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc806976784e494edc905d8b8bd9ad138054bbea
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/math_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else /* !defined(_MSC_VER) */
+#warning "math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* defined(_MSC_VER) */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__ /* records that this wrapper defined the guard above, so it is removed again after the include */
+#endif /* !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) */
+
+#include "crt/math_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* the guard was not defined on entry, so leave it undefined on exit */
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
+#endif /* defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__) */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/mma.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/mma.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f36f671c0b3a4e95cbb7bddbe41e75ac783b722
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/mma.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__ /* records that this wrapper defined the guard above, so it is removed again after the include */
+#endif /* !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) */
+
+#include "crt/mma.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* the guard was not defined on entry, so leave it undefined on exit */
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
+#endif /* defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__) */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvPTXCompiler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvPTXCompiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..5eeac41b766cdecf8b38552578a21cb6dbfa4fd0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvPTXCompiler.h
@@ -0,0 +1,328 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#ifndef nvPTXCompiler_INCLUDED
+#define nvPTXCompiler_INCLUDED
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* --- Dependency --- */
+#include <stddef.h> /* For size_t */
+
+/*************************************************************************/ /**
+ *
+ * \defgroup handle PTX-Compiler Handle
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup handle
+ * \brief   nvPTXCompilerHandle represents a handle to the PTX Compiler.
+ *
+ * To compile a PTX program string, an instance of nvPTXCompiler
+ * must be created and the handle to it must be obtained using the
+ * API nvPTXCompilerCreate(). Then the compilation can be done
+ * using the API nvPTXCompilerCompile().
+ *
+ */
+typedef struct nvPTXCompiler *nvPTXCompilerHandle;
+
+/**
+ *
+ * \defgroup error Error codes
+ *
+ */
+
+/** \ingroup error
+ *
+ * \brief     The nvPTXCompiler APIs return the nvPTXCompileResult codes to indicate the call result
+ */
+
+typedef enum {
+
+    /* The API call completed successfully */
+    NVPTXCOMPILE_SUCCESS                              = 0,
+
+    /* An invalid nvPTXCompilerHandle was passed to the API */
+    NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE        = 1,
+
+    /* Invalid inputs were given to the API */
+    NVPTXCOMPILE_ERROR_INVALID_INPUT                  = 2,
+
+    /* Compilation of the PTX program failed */
+    NVPTXCOMPILE_ERROR_COMPILATION_FAILURE            = 3,
+
+    /* Something went wrong internally */
+    NVPTXCOMPILE_ERROR_INTERNAL                       = 4,
+
+    /* The API was unable to allocate memory */
+    NVPTXCOMPILE_ERROR_OUT_OF_MEMORY                  = 5,
+
+    /* The handle was passed to an API which expects */
+    /* nvPTXCompilerCompile() to have been called on it previously */
+    NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE = 6,
+
+    /* The PTX version encountered in the PTX program is not */
+    /* supported by the current compiler */
+    NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION        = 7,
+
+    /* Device side sync is not supported by the SM version */
+    NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC       = 8,
+
+    /* Compilation has been cancelled by the user */
+    NVPTXCOMPILE_ERROR_CANCELLED                      = 9,
+} nvPTXCompileResult;
+
+/* ----------------------------- PTX Compiler APIs ---------------------------- */
+
+/**
+ *
+ * \defgroup versioning API Versioning
+ *
+ * The PTX compiler APIs are versioned so that any new features or API
+ * changes can be done by bumping up the API version.
+ */
+
+/** \ingroup versioning
+ *
+ * \brief            Queries the current \p major and \p minor version of
+ *                   PTX Compiler APIs being used
+ *
+ * \param            [out] major   Major version of the PTX Compiler APIs
+ * \param            [out] minor   Minor version of the PTX Compiler APIs
+ * \note                           The version of PTX Compiler APIs follows the CUDA Toolkit versioning.
+ *                                 The PTX ISA version supported by a PTX Compiler API version is listed
+ *                                 <a href="https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes">here</a>.
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ */
+nvPTXCompileResult nvPTXCompilerGetVersion(unsigned int *major, unsigned int *minor);
+
+/**
+ *
+ * \defgroup compilation Compilation APIs
+ *
+ */
+
+/** \ingroup compilation
+ *
+ * \brief            Obtains the handle to an instance of the PTX compiler
+ *                   initialized with the given PTX program \p ptxCode
+ *
+ * \param            [out] compiler  Returns a handle to PTX compiler initialized
+ *                                   with the PTX program \p ptxCode
+ * \param            [in] ptxCodeLen Size of the PTX program \p ptxCode passed as string
+ * \param            [in] ptxCode    The PTX program which is to be compiled passed as string.
+ *
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ */
+nvPTXCompileResult nvPTXCompilerCreate(nvPTXCompilerHandle *compiler, size_t ptxCodeLen, const char *ptxCode);
+
+/** \ingroup compilation
+ *
+ * \brief            Destroys and cleans up an already created PTX compiler
+ *
+ * \param            [in] compiler  Pointer to the handle of the PTX compiler which is to be destroyed
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerDestroy(nvPTXCompilerHandle *compiler);
+
+/** \ingroup compilation
+ *
+ * \brief          Compile a PTX program with the given compiler options
+ *
+ * \param            [in,out] compiler      A handle to PTX compiler initialized with the
+ *                                          PTX program which is to be compiled.
+ *                                          The compiled program can be accessed using the handle
+ * \param            [in] numCompileOptions Length of the array \p compileOptions
+ * \param            [in] compileOptions   Compiler options with which compilation should be done.
+ *                                         The compiler options string is a null terminated character array.
+ *                                         A valid list of compiler options is at
+ *                                         <a href="http://docs.nvidia.com/cuda/ptx-compiler-api/index.html#compile-options">link</a>.
+ * \note                                   --gpu-name (-arch) is a mandatory option.
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILATION_FAILURE  \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_PTX_VERSION  \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_UNSUPPORTED_DEVSIDE_SYNC \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerCompile(nvPTXCompilerHandle compiler, int numCompileOptions,
+                                        const char *const *compileOptions);
+
+/** \ingroup compilation
+ *
+ * \brief            Obtains the size of the image of the compiled program
+ *
+ * \param            [in] compiler          A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] binaryImageSize  The size of the image of the compiled program
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \endlink
+ *
+ * \note             nvPTXCompilerCompile() API should be invoked for the handle before calling this API.
+ *                   Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is returned.
+ */
+nvPTXCompileResult nvPTXCompilerGetCompiledProgramSize(nvPTXCompilerHandle compiler, size_t *binaryImageSize);
+
+/** \ingroup compilation
+ *
+ * \brief            Obtains the image of the compiled program
+ *
+ * \param            [in] compiler          A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] binaryImage      The image of the compiled program.
+ *                                         Client should allocate memory for \p binaryImage
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE \endlink
+ *
+ * \note             nvPTXCompilerCompile() API should be invoked for the handle before calling this API.
+ *                   Otherwise, NVPTXCOMPILE_ERROR_COMPILER_INVOCATION_INCOMPLETE is returned.
+ *
+ */
+
+nvPTXCompileResult nvPTXCompilerGetCompiledProgram(nvPTXCompilerHandle compiler, void *binaryImage);
+
+/** \ingroup compilation
+ *
+ * \brief            Query the size of the error message that was seen previously for the handle
+ *
+ * \param            [in] compiler          A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] errorLogSize     The size of the error log in bytes which was produced
+ *                                          in the previous call to nvPTXCompilerCompile().
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetErrorLogSize(nvPTXCompilerHandle compiler, size_t *errorLogSize);
+
+/** \ingroup compilation
+ *
+ * \brief            Query the error message that was seen previously for the handle
+ *
+ * \param            [in] compiler         A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] errorLog        The error log which was produced in the previous call to nvPTXCompilerCompile().
+ *                                         Clients should allocate memory for \p errorLog
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetErrorLog(nvPTXCompilerHandle compiler, char *errorLog);
+
+/** \ingroup compilation
+ *
+ * \brief            Query the size of the information message that was seen previously for the handle
+ *
+ * \param            [in] compiler        A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] infoLogSize    The size of the information log in bytes which was produced
+ *                                         in the previous call to nvPTXCompilerCompile().
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetInfoLogSize(nvPTXCompilerHandle compiler, size_t *infoLogSize);
+
+/** \ingroup compilation
+ *
+ * \brief           Query the information message that was seen previously for the handle
+ *
+ * \param            [in] compiler        A handle to PTX compiler on which nvPTXCompilerCompile() has been performed.
+ * \param            [out] infoLog        The information log which was produced in the previous call to nvPTXCompilerCompile().
+ *                                        Clients should allocate memory for \p infoLog
+ *
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INTERNAL \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerGetInfoLog(nvPTXCompilerHandle compiler, char *infoLog);
+
+/** \ingroup compilation
+ *
+ * \brief           Register a callback function that the compiler will invoke at different phases of
+ *                  PTX Compilation during a call to nvPTXCompilerCompile().
+ *                  The callback function can cancel the compilation by returning specific values.
+ *
+ *                  Callback function must satisfy the following constraints
+ *                  (1) Its signature should be
+ *                      @code
+ *                      int callback(void* param1, void* param2);
+ *                      @endcode
+ *                      When invoking the callback, the compiler will always pass \p payload to
+ *                      param1 so that the callback may make decisions based on \p payload . It'll
+ *                      always pass NULL to param2 for now which is reserved for future extensions.
+ *
+ *                  (2) It must return 1 to cancel compilation or 0 to continue.
+ *                      Other return values are reserved for future use.
+ *
+ *                  (3) It must return consistent values. Once it returns 1 at one point, it must
+ *                      return 1 in all following invocations during the current nvPTXCompilerCompile
+ *                      call in progress.
+ *
+ *                  (4) It must be thread-safe.
+ *
+ *                  (5) It must not invoke any nvrtc/libnvvm/ptx APIs.
+ *
+ * \param            [in] compiler        A handle to an initialized PTX compiler in which to introduce the callback.
+ * \param            [in] callback        Function pointer to the callback function.
+ * \param            [in] payload         Payload to be passed as a parameter when invoking the callback.
+ * \return
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_SUCCESS \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvPTXCompileResult NVPTXCOMPILE_ERROR_INVALID_COMPILER_HANDLE \endlink
+ *
+ */
+nvPTXCompileResult nvPTXCompilerSetFlowCallback(nvPTXCompilerHandle compiler, int (*callback)(void *, void *),
+                                                void *payload);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // nvPTXCompiler_INCLUDED
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_common.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ed01f7bc2851f43678e58efe34fc5579cca3a35
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_common.h
@@ -0,0 +1,393 @@
+#ifndef NVPERF_COMMON_H
+#define NVPERF_COMMON_H
+
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility push(default) /* declarations that follow get default visibility */
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden"))) /* opt-out marker: hidden visibility */
+    #endif /* !defined(NVPW_LOCAL) */
+#else /* not (__GNUC__ with NVPA_SHARED_LIB) */
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL /* expands to nothing in this configuration */
+    #endif /* !defined(NVPW_LOCAL) */
+#endif /* defined(__GNUC__) && defined(NVPA_SHARED_LIB) */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_common.h
+ */
+
+#ifndef NVPERF_NVPA_STATUS_DEFINED
+#define NVPERF_NVPA_STATUS_DEFINED
+
+    /// Error codes.
+    typedef enum NVPA_Status
+    {
+        /// Success
+        NVPA_STATUS_SUCCESS = 0,
+        /// Generic error.
+        NVPA_STATUS_ERROR = 1,
+        /// Internal error.  Please file a bug!
+        NVPA_STATUS_INTERNAL_ERROR = 2,
+        /// NVPW_InitializeTarget() or NVPW_InitializeHost() has not been called yet.
+        NVPA_STATUS_NOT_INITIALIZED = 3,
+        /// The NvPerf DLL/DSO could not be loaded during NVPW_Initialize*(). Please ensure they are placed in the
+        /// appropriate location that can be found by a dynamic linker. And on Linux systems, confirm that the
+        /// LD_LIBRARY_PATH environment variable is set correctly. Alternatively, you may utilize
+        /// NVPW_SetLibraryLoadPaths() to define additional library search paths.
+        NVPA_STATUS_NOT_LOADED = 4,
+        /// The function was not found in this version of the NvPerf DLL/DSO. Or if you are directly calling
+        /// NVPA_GetProcAddress(), please ensure the function name is spelled correctly.
+        NVPA_STATUS_FUNCTION_NOT_FOUND = 5,
+        /// The request was intentionally not supported.
+        NVPA_STATUS_NOT_SUPPORTED = 6,
+        /// The request was not implemented by this version.
+        NVPA_STATUS_NOT_IMPLEMENTED = 7,
+        /// Invalid argument.
+        NVPA_STATUS_INVALID_ARGUMENT = 8,
+        /// UNUSED
+        NVPA_STATUS_INVALID_METRIC_ID = 9,
+        /// No driver has been loaded via NVPW_*_LoadDriver().
+        NVPA_STATUS_DRIVER_NOT_LOADED = 10,
+        /// Failed memory allocation.
+        NVPA_STATUS_OUT_OF_MEMORY = 11,
+        /// UNUSED
+        NVPA_STATUS_INVALID_THREAD_STATE = 12,
+        /// UNUSED
+        NVPA_STATUS_FAILED_CONTEXT_ALLOC = 13,
+        /// The specified GPU is not supported. It is recommended to call IsGpuSupported() for more information.
+        NVPA_STATUS_UNSUPPORTED_GPU = 14,
+        /// The installed NVIDIA driver is too old.
+        NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION = 15,
+        /// UNUSED
+        NVPA_STATUS_OBJECT_NOT_REGISTERED = 16,
+        /// Profiling permission not granted; see https://developer.nvidia.com/nvidia-development-tools-solutions-
+        /// ERR_NVGPUCTRPERM-permission-issue-performance-counters
+        NVPA_STATUS_INSUFFICIENT_PRIVILEGE = 17,
+        /// UNUSED
+        NVPA_STATUS_INVALID_CONTEXT_STATE = 18,
+        /// UNUSED
+        NVPA_STATUS_INVALID_OBJECT_STATE = 19,
+        /// The request could not be fulfilled because a system resource is already in use.
+        NVPA_STATUS_RESOURCE_UNAVAILABLE = 20,
+        /// UNUSED
+        NVPA_STATUS_DRIVER_LOADED_TOO_LATE = 21,
+        /// The provided buffer is not large enough.
+        NVPA_STATUS_INSUFFICIENT_SPACE = 22,
+        /// UNUSED
+        NVPA_STATUS_OBJECT_MISMATCH = 23,
+        /// Virtualized GPU (vGPU) is not supported.
+        NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED = 24,
+        /// Profiling permission was not granted or the device was disabled.
+        NVPA_STATUS_PROFILING_NOT_ALLOWED = 25,
+        NVPA_STATUS__COUNT ///< Sentinel: one past the last status value above (evaluates to 26).
+    } NVPA_Status;
+
+
+    inline void NVPW_NVPAStatusToString(NVPA_Status status, const char** ppStatusStr, const char** ppCommentStr)
+    {
+        switch (status)
+        {
+            case NVPA_STATUS_SUCCESS:
+                *ppStatusStr = "NVPA_STATUS_SUCCESS";
+                *ppCommentStr = "Success";
+                return;
+            case NVPA_STATUS_ERROR:
+                *ppStatusStr = "NVPA_STATUS_ERROR";
+                *ppCommentStr = "Generic error.";
+                return;
+            case NVPA_STATUS_INTERNAL_ERROR:
+                *ppStatusStr = "NVPA_STATUS_INTERNAL_ERROR";
+                *ppCommentStr = "Internal error.  Please file a bug!";
+                return;
+            case NVPA_STATUS_NOT_INITIALIZED:
+                *ppStatusStr = "NVPA_STATUS_NOT_INITIALIZED";
+                *ppCommentStr = "NVPW_InitializeTarget() or NVPW_InitializeHost() has not been called yet.";
+                return;
+            case NVPA_STATUS_NOT_LOADED:
+                *ppStatusStr = "NVPA_STATUS_NOT_LOADED";
+                *ppCommentStr = "The NvPerf DLL/DSO could not be loaded during NVPW_Initialize*(). Please ensure they are placed in the appropriate location that can be founder by a dynamic linker. And on Linux systems, confirm that the LD_LIBRARY_PATH environment variable is set correctly. Alternatively, you may utilize NVPW_SetLibraryLoadPaths() to define additional library search paths.";
+                return;
+            case NVPA_STATUS_FUNCTION_NOT_FOUND:
+                *ppStatusStr = "NVPA_STATUS_FUNCTION_NOT_FOUND";
+                *ppCommentStr = "The function was not found in this version of the NvPerf DLL/DSO. Or if you are directly calling NVPA_GetProcAddress(), please ensure the function name is spelled correctly.";
+                return;
+            case NVPA_STATUS_NOT_SUPPORTED:
+                *ppStatusStr = "NVPA_STATUS_NOT_SUPPORTED";
+                *ppCommentStr = "The request was intentionally not supported.";
+                return;
+            case NVPA_STATUS_NOT_IMPLEMENTED:
+                *ppStatusStr = "NVPA_STATUS_NOT_IMPLEMENTED";
+                *ppCommentStr = "The request was not implemented by this version.";
+                return;
+            case NVPA_STATUS_INVALID_ARGUMENT:
+                *ppStatusStr = "NVPA_STATUS_INVALID_ARGUMENT";
+                *ppCommentStr = "Invalid argument.";
+                return;
+            case NVPA_STATUS_INVALID_METRIC_ID:
+                *ppStatusStr = "NVPA_STATUS_INVALID_METRIC_ID";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_DRIVER_NOT_LOADED:
+                *ppStatusStr = "NVPA_STATUS_DRIVER_NOT_LOADED";
+                *ppCommentStr = "No driver has been loaded via NVPW_*_LoadDriver().";
+                return;
+            case NVPA_STATUS_OUT_OF_MEMORY:
+                *ppStatusStr = "NVPA_STATUS_OUT_OF_MEMORY";
+                *ppCommentStr = "Failed memory allocation.";
+                return;
+            case NVPA_STATUS_INVALID_THREAD_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_THREAD_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_FAILED_CONTEXT_ALLOC:
+                *ppStatusStr = "NVPA_STATUS_FAILED_CONTEXT_ALLOC";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_UNSUPPORTED_GPU:
+                *ppStatusStr = "NVPA_STATUS_UNSUPPORTED_GPU";
+                *ppCommentStr = "The specified GPU is not supported. It is recommended to call IsGpuSupported() for more information";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION";
+                *ppCommentStr = "The installed NVIDIA driver is too old.";
+                return;
+            case NVPA_STATUS_OBJECT_NOT_REGISTERED:
+                *ppStatusStr = "NVPA_STATUS_OBJECT_NOT_REGISTERED";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_PRIVILEGE:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_PRIVILEGE";
+                *ppCommentStr = "Profiling permission not granted; see https://developer.nvidia.com/nvidia-development-tools-solutions-ERR_NVGPUCTRPERM-permission-issue-performance-counters";
+                return;
+            case NVPA_STATUS_INVALID_CONTEXT_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_CONTEXT_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INVALID_OBJECT_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_OBJECT_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_RESOURCE_UNAVAILABLE:
+                *ppStatusStr = "NVPA_STATUS_RESOURCE_UNAVAILABLE";
+                *ppCommentStr = "The request could not be fulfilled because a system resource is already in use.";
+                return;
+            case NVPA_STATUS_DRIVER_LOADED_TOO_LATE:
+                *ppStatusStr = "NVPA_STATUS_DRIVER_LOADED_TOO_LATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_SPACE:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_SPACE";
+                *ppCommentStr = "The provided buffer is not large enough.";
+                return;
+            case NVPA_STATUS_OBJECT_MISMATCH:
+                *ppStatusStr = "NVPA_STATUS_OBJECT_MISMATCH";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED:
+                *ppStatusStr = "NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED";
+                *ppCommentStr = "Virtualized GPU (vGPU) is not supported.";
+                return;
+            case NVPA_STATUS_PROFILING_NOT_ALLOWED:
+                *ppStatusStr = "NVPA_STATUS_PROFILING_NOT_ALLOWED";
+                *ppCommentStr = "Profiling permission was not granted or the device was disabled.";
+                return;
+            default:
+                *ppStatusStr = "Unrecognized status";
+                *ppCommentStr = "This status is unrecognized. Is it coming from a newer version of NvPerf library?";
+                return;
+        }
+    }
+
+
+#endif // NVPERF_NVPA_STATUS_DEFINED
+
+
+#ifndef NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+#define NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+
+    /// The configuration's activity-kind dictates which types of data may be collected.
+    typedef enum NVPA_ActivityKind
+    {
+        /// Invalid value; this is also what a zero-initialized NVPA_ActivityKind holds.
+        NVPA_ACTIVITY_KIND_INVALID = 0,
+        /// A workload-centric activity for serialized and pipelined collection.
+        /// 
+        /// Profiler is capable of collecting both serialized and pipelined metrics.  The library introduces any
+        /// synchronization required to collect serialized metrics.
+        NVPA_ACTIVITY_KIND_PROFILER,
+        /// A realtime activity for sampling counters from the CPU or GPU.
+        NVPA_ACTIVITY_KIND_REALTIME_SAMPLED,
+        /// A realtime activity for profiling counters from the CPU or GPU without CPU/GPU synchronizations.
+        NVPA_ACTIVITY_KIND_REALTIME_PROFILER,
+        NVPA_ACTIVITY_KIND__COUNT ///< Sentinel: equals the number of enumerators declared above it.
+    } NVPA_ActivityKind;
+
+
+#endif // NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+
+
+#ifndef NVPERF_NVPA_BOOL_DEFINED
+#define NVPERF_NVPA_BOOL_DEFINED
+    /// The type used for boolean values in parameter structs; it is a single unsigned byte (uint8_t).
+    typedef uint8_t NVPA_Bool;
+#endif // NVPERF_NVPA_BOOL_DEFINED
+/// Byte size of `type_` measured through the end of `lastfield_` (offset of that field plus its size).
+#ifndef NVPA_STRUCT_SIZE
+#define NVPA_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif // NVPA_STRUCT_SIZE
+/// True when `(pParams_)->structSize` reaches at least to the end of field `name_` within `*pParams_`.
+#ifndef NVPW_FIELD_EXISTS
+#define NVPW_FIELD_EXISTS(pParams_, name_) \
+    ((pParams_)->structSize >= (size_t)((const uint8_t*)(&(pParams_)->name_) + sizeof(pParams_)->name_ - (const uint8_t*)(pParams_)))
+#endif // NVPW_FIELD_EXISTS
+
+
+#ifndef NVPERF_NVPA_GETPROCADDRESS_DEFINED
+#define NVPERF_NVPA_GETPROCADDRESS_DEFINED
+/// Generic function-pointer type; this is what NVPA_GetProcAddress() returns.
+typedef NVPA_Status (*NVPA_GenericFn)(void);
+
+
+    /// 
+    /// Gets the address of an NvPerf API function.
+    /// The result is typed as NVPA_GenericFn; cast it to the target function's own pointer type before calling.
+    /// \return A function pointer to the function, or NULL if the function is not available.
+    /// 
+    /// \param pFunctionName [in] Name of the function to retrieve.
+    NVPA_GenericFn NVPA_GetProcAddress(const char* pFunctionName);
+
+#endif // NVPERF_NVPA_GETPROCADDRESS_DEFINED
+
+#ifndef NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+#define NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+
+
+    typedef struct NVPW_SetLibraryLoadPaths_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] number of paths in ppPaths
+        size_t numPaths;
+        /// [in] array of null-terminated paths
+        const char** ppPaths;
+    } NVPW_SetLibraryLoadPaths_Params;
+#define NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_SetLibraryLoadPaths_Params, ppPaths)
+
+    /// Sets library search path for \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget().
+    /// \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget() load the NvPerf DLL/DSO.  This function sets
+    /// ordered paths that will be searched with the LoadLibrary() or dlopen() call.
+    /// If load paths are set by this function, the default set of load paths
+    /// will not be attempted.
+    /// Each path must point at a directory (not a file name).
+    /// This function is not thread-safe.
+    /// Example Usage:
+    /// \code
+    ///     const char* paths[] = {
+    ///         "path1", "path2", etc
+    ///     };
+    ///     NVPW_SetLibraryLoadPaths_Params params{NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE};
+    ///     params.numPaths = sizeof(paths)/sizeof(paths[0]);
+    ///     params.ppPaths = paths;
+    ///     NVPW_SetLibraryLoadPaths(&params);
+    ///     NVPW_InitializeHost(&initParams); // initParams: an NVPW_InitializeHost_Params with structSize set
+    ///     params.numPaths = 0;
+    ///     params.ppPaths = NULL;
+    ///     NVPW_SetLibraryLoadPaths(&params);
+    /// \endcode
+    NVPA_Status NVPW_SetLibraryLoadPaths(NVPW_SetLibraryLoadPaths_Params* pParams);
+
+    typedef struct NVPW_SetLibraryLoadPathsW_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] number of paths in ppwPaths
+        size_t numPaths;
+        /// [in] array of null-terminated wide-character (wchar_t) paths
+        const wchar_t** ppwPaths;
+    } NVPW_SetLibraryLoadPathsW_Params;
+#define NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_SetLibraryLoadPathsW_Params, ppwPaths)
+
+    /// Sets library search path for \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget().
+    /// \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget() load the NvPerf DLL/DSO.  This function sets
+    /// ordered paths that will be searched with the LoadLibrary() or dlopen() call.
+    /// If load paths are set by this function, the default set of load paths
+    /// will not be attempted.
+    /// Each path must point at a directory (not a file name).
+    /// This function is not thread-safe.
+    /// Example Usage:
+    /// \code
+    ///     const wchar_t* wpaths[] = {
+    ///         L"path1", L"path2", etc
+    ///     };
+    ///     NVPW_SetLibraryLoadPathsW_Params params{NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE};
+    ///     params.numPaths = sizeof(wpaths)/sizeof(wpaths[0]);
+    ///     params.ppwPaths = wpaths;
+    ///     NVPW_SetLibraryLoadPathsW(&params);
+    ///     NVPW_InitializeHost(&initParams); // initParams: an NVPW_InitializeHost_Params with structSize set
+    ///     params.numPaths = 0;
+    ///     params.ppwPaths = NULL;
+    ///     NVPW_SetLibraryLoadPathsW(&params);
+    /// \endcode
+    NVPA_Status NVPW_SetLibraryLoadPathsW(NVPW_SetLibraryLoadPathsW_Params* pParams);
+
+#endif // NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_COMMON_H
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_host.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_host.h
new file mode 100644
index 0000000000000000000000000000000000000000..62a53528b64d6b3da8daf7058cec21781ae0e8cb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_host.h
@@ -0,0 +1,1178 @@
+#ifndef NVPERF_HOST_H
+#define NVPERF_HOST_H
+
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_host.h
+ */
+
+
+// Guard against multiple definition of NvPerf host types
+#ifndef NVPERF_HOST_API_DEFINED
+#define NVPERF_HOST_API_DEFINED
+
+
+/***************************************************************************//**
+ *  @name   Host Configuration
+ *  @{
+ */
+
+    typedef struct NVPW_InitializeHost_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+    } NVPW_InitializeHost_Params;
+#define NVPW_InitializeHost_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeHost_Params, pPriv)
+
+    /// Load the host library.
+    NVPA_Status NVPW_InitializeHost(NVPW_InitializeHost_Params* pParams);
+
+    typedef struct NVPW_CounterData_CalculateCounterDataImageCopySize_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The CounterDataPrefix generated from e.g.    nvperf2 initdata   or
+        /// NVPW_CounterDataBuilder_GetCounterDataPrefix().  Must be align(8).
+        const uint8_t* pCounterDataPrefix;
+        size_t counterDataPrefixSize; ///< size of pCounterDataPrefix (presumably in bytes -- confirm)
+        /// max number of ranges that can be profiled
+        uint32_t maxNumRanges;
+        /// max number of RangeTree nodes; must be >= maxNumRanges
+        uint32_t maxNumRangeTreeNodes;
+        /// max string length of each RangeName, including the trailing NUL character
+        uint32_t maxRangeNameLength;
+        const uint8_t* pCounterDataSrc;
+        /// [out] required size of the copy buffer
+        size_t copyDataImageCounterSize;
+    } NVPW_CounterData_CalculateCounterDataImageCopySize_Params;
+#define NVPW_CounterData_CalculateCounterDataImageCopySize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_CalculateCounterDataImageCopySize_Params, copyDataImageCounterSize)
+
+    NVPA_Status NVPW_CounterData_CalculateCounterDataImageCopySize(NVPW_CounterData_CalculateCounterDataImageCopySize_Params* pParams);
+
+    typedef struct NVPW_CounterData_InitializeCounterDataImageCopy_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The CounterDataPrefix generated from e.g.    nvperf2 initdata   or
+        /// NVPW_CounterDataBuilder_GetCounterDataPrefix().  Must be align(8).
+        const uint8_t* pCounterDataPrefix;
+        size_t counterDataPrefixSize; ///< size of pCounterDataPrefix (presumably in bytes -- confirm)
+        /// max number of ranges that can be profiled
+        uint32_t maxNumRanges;
+        /// max number of RangeTree nodes; must be >= maxNumRanges
+        uint32_t maxNumRangeTreeNodes;
+        /// max string length of each RangeName, including the trailing NUL character
+        uint32_t maxRangeNameLength;
+        const uint8_t* pCounterDataSrc;
+        uint8_t* pCounterDataDst; ///< NOTE(review): no size field accompanies this buffer -- confirm how it must be sized
+    } NVPW_CounterData_InitializeCounterDataImageCopy_Params;
+#define NVPW_CounterData_InitializeCounterDataImageCopy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_InitializeCounterDataImageCopy_Params, pCounterDataDst)
+
+    NVPA_Status NVPW_CounterData_InitializeCounterDataImageCopy(NVPW_CounterData_InitializeCounterDataImageCopy_Params* pParams);
+
+    typedef struct NVPW_CounterData_ExtractCounterDataPrefix_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The source buffer to extract the prefix from.
+        const uint8_t* pCounterDataSrc;
+        size_t counterDataSrcSize; ///< size of pCounterDataSrc (presumably in bytes -- confirm)
+        /// [in] If not NULL, the prefix will be copied into this buffer.
+        uint8_t* pCounterDataPrefix;
+        /// [inout] if 'pCounterDataPrefix' is NULL, size of counter data prefix will be returned; otherwise it should
+        /// be set to the size of buffer allocated for 'pCounterDataPrefix'.
+        size_t counterDataPrefixSize;
+    } NVPW_CounterData_ExtractCounterDataPrefix_Params;
+#define NVPW_CounterData_ExtractCounterDataPrefix_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_ExtractCounterDataPrefix_Params, counterDataPrefixSize)
+
+    NVPA_Status NVPW_CounterData_ExtractCounterDataPrefix(NVPW_CounterData_ExtractCounterDataPrefix_Params* pParams);
+
+    typedef struct NVPA_CounterDataCombiner NVPA_CounterDataCombiner; // opaque: only ever used through pointers in this header
+
+    typedef struct NVPW_CounterDataCombiner_Create_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The destination counter data into which the source counter data will be combined
+        uint8_t* pCounterDataDst;
+        /// [out] The created counter data combiner
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+    } NVPW_CounterDataCombiner_Create_Params;
+#define NVPW_CounterDataCombiner_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_Create_Params, pCounterDataCombiner)
+
+    NVPA_Status NVPW_CounterDataCombiner_Create(NVPW_CounterDataCombiner_Create_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_Destroy_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+    } NVPW_CounterDataCombiner_Destroy_Params;
+#define NVPW_CounterDataCombiner_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_Destroy_Params, pCounterDataCombiner)
+
+    NVPA_Status NVPW_CounterDataCombiner_Destroy(NVPW_CounterDataCombiner_Destroy_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_CreateRange_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t numDescriptions; ///< presumably the number of entries in ppDescriptions -- confirm
+        const char* const* ppDescriptions;
+        /// [out]
+        size_t rangeIndexDst;
+    } NVPW_CounterDataCombiner_CreateRange_Params;
+#define NVPW_CounterDataCombiner_CreateRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_CreateRange_Params, rangeIndexDst)
+
+    NVPA_Status NVPW_CounterDataCombiner_CreateRange(NVPW_CounterDataCombiner_CreateRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_CopyIntoRange_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        /// [in]
+        size_t rangeIndexDst;
+        /// [in]
+        const uint8_t* pCounterDataSrc;
+        /// [in]
+        size_t rangeIndexSrc;
+    } NVPW_CounterDataCombiner_CopyIntoRange_Params;
+#define NVPW_CounterDataCombiner_CopyIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_CopyIntoRange_Params, rangeIndexSrc)
+
+    /// In order to use this API, the source counter data and the destination counter data must have identical counters.
+    NVPA_Status NVPW_CounterDataCombiner_CopyIntoRange(NVPW_CounterDataCombiner_CopyIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_AccumulateIntoRange_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        uint32_t dstMultiplier; ///< integer weight (uint32_t); the WeightedSumIntoRange variant takes double weights
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+        uint32_t srcMultiplier; ///< integer weight (uint32_t); the WeightedSumIntoRange variant takes double weights
+    } NVPW_CounterDataCombiner_AccumulateIntoRange_Params;
+#define NVPW_CounterDataCombiner_AccumulateIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_AccumulateIntoRange_Params, srcMultiplier)
+
+    NVPA_Status NVPW_CounterDataCombiner_AccumulateIntoRange(NVPW_CounterDataCombiner_AccumulateIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_SumIntoRange_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+    } NVPW_CounterDataCombiner_SumIntoRange_Params;
+#define NVPW_CounterDataCombiner_SumIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_SumIntoRange_Params, rangeIndexSrc)
+
+    NVPA_Status NVPW_CounterDataCombiner_SumIntoRange(NVPW_CounterDataCombiner_SumIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_WeightedSumIntoRange_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        double dstMultiplier; ///< floating-point weight (double); the AccumulateIntoRange variant takes uint32_t weights
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+        double srcMultiplier; ///< floating-point weight (double); the AccumulateIntoRange variant takes uint32_t weights
+    } NVPW_CounterDataCombiner_WeightedSumIntoRange_Params;
+#define NVPW_CounterDataCombiner_WeightedSumIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_WeightedSumIntoRange_Params, srcMultiplier)
+
+    NVPA_Status NVPW_CounterDataCombiner_WeightedSumIntoRange(NVPW_CounterDataCombiner_WeightedSumIntoRange_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+/***************************************************************************//**
+ *  @name   Metrics Configuration
+ *  @{
+ */
+
+    typedef struct NVPA_RawMetricsConfig NVPA_RawMetricsConfig; // opaque: only ever used through pointers in this header
+
+    typedef struct NVPA_RawMetricRequest
+    {
+        /// [in] caller's size of this struct; set to NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const char* pMetricName;
+        /// [in]
+        NVPA_Bool isolated;
+        /// [in] ignored by AddMetric but observed by CounterData initialization
+        NVPA_Bool keepInstances;
+    } NVPA_RawMetricRequest;
+#define NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPA_RawMetricRequest, keepInstances)
+
+    typedef struct NVPW_GetSupportedChipNames_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [out] array of chip-name strings; presumably numChipNames entries long -- confirm
+        const char* const* ppChipNames;
+        /// [out]
+        size_t numChipNames;
+    } NVPW_GetSupportedChipNames_Params;
+#define NVPW_GetSupportedChipNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetSupportedChipNames_Params, numChipNames)
+
+    NVPA_Status NVPW_GetSupportedChipNames(NVPW_GetSupportedChipNames_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_Destroy_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_RawMetricsConfig_Destroy_Params;
+#define NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_Destroy_Params, pRawMetricsConfig)
+
+    NVPA_Status NVPW_RawMetricsConfig_Destroy(NVPW_RawMetricsConfig_Destroy_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_SetCounterAvailability_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] buffer with counter availability image
+        const uint8_t* pCounterAvailabilityImage;
+    } NVPW_RawMetricsConfig_SetCounterAvailability_Params;
+#define NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_SetCounterAvailability_Params, pCounterAvailabilityImage)
+
+    NVPA_Status NVPW_RawMetricsConfig_SetCounterAvailability(NVPW_RawMetricsConfig_SetCounterAvailability_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_BeginPassGroup_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t maxPassCount;
+    } NVPW_RawMetricsConfig_BeginPassGroup_Params;
+#define NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_BeginPassGroup_Params, maxPassCount)
+
+    NVPA_Status NVPW_RawMetricsConfig_BeginPassGroup(NVPW_RawMetricsConfig_BeginPassGroup_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_EndPassGroup_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_RawMetricsConfig_EndPassGroup_Params;
+#define NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_EndPassGroup_Params, pRawMetricsConfig)
+
+    NVPA_Status NVPW_RawMetricsConfig_EndPassGroup(NVPW_RawMetricsConfig_EndPassGroup_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumMetrics_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numMetrics;
+    } NVPW_RawMetricsConfig_GetNumMetrics_Params;
+#define NVPW_RawMetricsConfig_GetNumMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumMetrics_Params, numMetrics)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetNumMetrics(NVPW_RawMetricsConfig_GetNumMetrics_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetMetricProperties_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t metricIndex; ///< presumably in [0, numMetrics) as reported by GetNumMetrics -- confirm
+        /// [out]
+        const char* pMetricName;
+        /// [out]
+        NVPA_Bool supportsPipelined;
+        /// [out]
+        NVPA_Bool supportsIsolated;
+    } NVPW_RawMetricsConfig_GetMetricProperties_Params;
+#define NVPW_RawMetricsConfig_GetMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetMetricProperties_Params, supportsIsolated)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetMetricProperties(NVPW_RawMetricsConfig_GetMetricProperties_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetMetricProperties_V2_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t metricIndex; ///< presumably in [0, numMetrics) as reported by GetNumMetrics -- confirm
+        /// [out]
+        const char* pMetricName;
+    } NVPW_RawMetricsConfig_GetMetricProperties_V2_Params;
+#define NVPW_RawMetricsConfig_GetMetricProperties_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetMetricProperties_V2_Params, pMetricName)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetMetricProperties_V2(NVPW_RawMetricsConfig_GetMetricProperties_V2_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_AddMetrics_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests; ///< presumably the number of elements in pRawMetricRequests -- confirm
+    } NVPW_RawMetricsConfig_AddMetrics_Params;
+#define NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_AddMetrics_Params, numMetricRequests)
+
+    NVPA_Status NVPW_RawMetricsConfig_AddMetrics(NVPW_RawMetricsConfig_AddMetrics_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_IsAddMetricsPossible_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests; ///< presumably the number of elements in pRawMetricRequests -- confirm
+        /// [out]
+        NVPA_Bool isPossible;
+    } NVPW_RawMetricsConfig_IsAddMetricsPossible_Params;
+#define NVPW_RawMetricsConfig_IsAddMetricsPossible_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_IsAddMetricsPossible_Params, isPossible)
+
+    NVPA_Status NVPW_RawMetricsConfig_IsAddMetricsPossible(NVPW_RawMetricsConfig_IsAddMetricsPossible_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GenerateConfigImage_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] If true, all existing pass groups may be merged to reduce the number of passes.
+        /// If the merge was successful, distribution of counters in passes may be updated as a side-effect. The
+        /// effects will be persistent in pRawMetricsConfig.
+        NVPA_Bool mergeAllPassGroups;
+    } NVPW_RawMetricsConfig_GenerateConfigImage_Params;
+#define NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GenerateConfigImage_Params, mergeAllPassGroups)
+
+    /// This API may fail if called inside a pass group with `mergeAllPassGroups` = true.
+    NVPA_Status NVPW_RawMetricsConfig_GenerateConfigImage(NVPW_RawMetricsConfig_GenerateConfigImage_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetConfigImage_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] Number of bytes allocated for pBuffer
+        size_t bytesAllocated;
+        /// [out] [optional] Buffer receiving the config image
+        uint8_t* pBuffer;
+        /// [out] Count of bytes that would be copied into pBuffer
+        size_t bytesCopied;
+    } NVPW_RawMetricsConfig_GetConfigImage_Params;
+#define NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetConfigImage_Params, bytesCopied)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetConfigImage(NVPW_RawMetricsConfig_GetConfigImage_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumPasses_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numPipelinedPasses;
+        /// [out]
+        size_t numIsolatedPasses;
+    } NVPW_RawMetricsConfig_GetNumPasses_Params;
+#define NVPW_RawMetricsConfig_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumPasses_Params, numIsolatedPasses)
+
+    /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
+    NVPA_Status NVPW_RawMetricsConfig_GetNumPasses(NVPW_RawMetricsConfig_GetNumPasses_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumPasses_V2_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numPasses;
+    } NVPW_RawMetricsConfig_GetNumPasses_V2_Params;
+#define NVPW_RawMetricsConfig_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumPasses_V2_Params, numPasses)
+
+    /// Total num passes = numPasses * numNestingLevels
+    NVPA_Status NVPW_RawMetricsConfig_GetNumPasses_V2(NVPW_RawMetricsConfig_GetNumPasses_V2_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] Typically created by e.g. NVPW_RawMetricsConfig_GetConfigImage(), must be align(8).
+        const uint8_t* pConfig;
+        /// [in] size of pConfig (presumably in bytes -- confirm)
+        size_t configSize;
+        /// [out]
+        size_t sampleSize;
+    } NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params;
+#define NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params, sampleSize)
+
+    /// Estimate per sample records size based on a virtual device
+    NVPA_Status NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize(NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params
+    {
+        /// [in] caller's size of this struct; set to the matching *_STRUCT_SIZE macro defined after it
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] Typically created by e.g. NVPW_RawMetricsConfig_GetConfigImage(), must be align(8).
+        const uint8_t* pConfig;
+        /// [in] size of pConfig (presumably in bytes -- confirm)
+        size_t configSize;
+        /// [out]
+        size_t sampleSize;
+    } NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params;
+#define NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params, sampleSize)
+
+    /// Estimate per sample records size based on a virtual device
+    NVPA_Status NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize(NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+    typedef struct NVPW_Config_GetRawCounterInfo_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] NOTE(review): presumably a config image, e.g. from NVPW_RawMetricsConfig_GetConfigImage() - confirm
+        const uint8_t* pConfig;
+        /// [in] presumably the size in bytes of the data at 'pConfig' - confirm
+        size_t configSize;
+        /// [in] name of the raw counter whose pass indices are requested
+        const char* pRawCounterName;
+        /// [inout] array containing indices of passes the counter resides in. 'pPassIndices' is in, '*pPassIndices' is
+        /// out.
+        size_t* pPassIndices;
+        /// [inout] if 'pPassIndices' is NULL, the count of passes this counter resides in will be returned; otherwise
+        /// it should be set to the capacity of 'pPassIndices' array, and on return, it will be overwritten to reflect
+        /// the actual count filled into 'pPassIndices'
+        size_t numPassIndices;
+    } NVPW_Config_GetRawCounterInfo_Params;
+#define NVPW_Config_GetRawCounterInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetRawCounterInfo_Params, numPassIndices)
+
+    NVPA_Status NVPW_Config_GetRawCounterInfo(NVPW_Config_GetRawCounterInfo_Params* pParams);
+
+    typedef struct NVPW_Config_GetRawCounters_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] NOTE(review): presumably a config image, e.g. from NVPW_RawMetricsConfig_GetConfigImage() - confirm
+        const uint8_t* pConfig;
+        /// [in] presumably the size in bytes of the data at 'pConfig' - confirm
+        size_t configSize;
+        /// [in] presumably the index of the pass whose raw counters are listed - confirm
+        size_t passIndex;
+        /// [inout] array containing raw counter names. 'ppRawCounterNames' is in, '*ppRawCounterNames' is out.
+        const char** ppRawCounterNames;
+        /// [inout] if 'ppRawCounterNames' is NULL, the count of raw counters will be returned; otherwise it should be
+        /// set to the capacity of 'ppRawCounterNames' array, and on return, it will be overwritten to reflect the
+        /// actual count filled into 'ppRawCounterNames'
+        size_t numRawCounters;
+    } NVPW_Config_GetRawCounters_Params;
+#define NVPW_Config_GetRawCounters_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetRawCounters_Params, numRawCounters)
+
+    NVPA_Status NVPW_Config_GetRawCounters(NVPW_Config_GetRawCounters_Params* pParams);
+
+/***************************************************************************//**
+ *  @name   CounterData Creation
+ *  @{
+ */
+
+    typedef struct NVPA_CounterDataBuilder NVPA_CounterDataBuilder; // forward declaration; no definition is visible in this header section
+
+    typedef struct NVPW_CounterDataBuilder_Create_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [out] builder handle produced by this call
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+        const char* pChipName; ///< NOTE(review): undocumented; presumably [in] name of the target chip - confirm
+    } NVPW_CounterDataBuilder_Create_Params;
+#define NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_Create_Params, pChipName)
+
+    NVPA_Status NVPW_CounterDataBuilder_Create(NVPW_CounterDataBuilder_Create_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_Destroy_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder; ///< NOTE(review): undocumented; presumably [in] the builder to destroy - confirm
+    } NVPW_CounterDataBuilder_Destroy_Params;
+#define NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_Destroy_Params, pCounterDataBuilder)
+
+    NVPA_Status NVPW_CounterDataBuilder_Destroy(NVPW_CounterDataBuilder_Destroy_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_AddMetrics_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder; ///< NOTE(review): undocumented; presumably [in] the builder receiving the metrics - confirm
+        const NVPA_RawMetricRequest* pRawMetricRequests; ///< NOTE(review): undocumented; presumably [in] array of 'numMetricRequests' requests - confirm
+        size_t numMetricRequests; ///< NOTE(review): undocumented; presumably [in] element count of 'pRawMetricRequests' - confirm
+    } NVPW_CounterDataBuilder_AddMetrics_Params;
+#define NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_AddMetrics_Params, numMetricRequests)
+
+    NVPA_Status NVPW_CounterDataBuilder_AddMetrics(NVPW_CounterDataBuilder_AddMetrics_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_GetCounterDataPrefix_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder; ///< NOTE(review): undocumented; presumably [in] the builder to read the prefix from - confirm
+        /// [in] Number of bytes allocated for pBuffer
+        size_t bytesAllocated;
+        /// [out] [optional] Buffer receiving the counter data prefix
+        uint8_t* pBuffer;
+        /// [out] Count of bytes that would be copied to pBuffer
+        size_t bytesCopied;
+    } NVPW_CounterDataBuilder_GetCounterDataPrefix_Params;
+#define NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_GetCounterDataPrefix_Params, bytesCopied)
+
+    NVPA_Status NVPW_CounterDataBuilder_GetCounterDataPrefix(NVPW_CounterDataBuilder_GetCounterDataPrefix_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+/***************************************************************************//**
+ *  @name   Metrics Evaluator
+ *  @{
+ */
+
+    typedef struct NVPW_MetricsEvaluator NVPW_MetricsEvaluator; // forward declaration; the parameter structs in this section only hold pointers to it
+
+#ifndef NVPW_DIM_UNIT_DEFINED
+#define NVPW_DIM_UNIT_DEFINED
+    typedef enum NVPW_DimUnitName // dimensional unit identifiers; carried in 'dimUnit' fields as uint32_t
+    { // NOTE: many values exceed 2^31-1 (e.g. 3518299157), so they do not fit a 32-bit signed int
+        NVPW_DIM_UNIT_INVALID = 3518299157,
+        NVPW_DIM_UNIT_UNITLESS = 2126137902,
+        NVPW_DIM_UNIT_ATTRIBUTES = 3776338729,
+        NVPW_DIM_UNIT_BYTES = 3797850191,
+        NVPW_DIM_UNIT_CTAS = 1960564139,
+        NVPW_DIM_UNIT_CTC_CYCLES = 2224883873,
+        NVPW_DIM_UNIT_DRAM_CYCLES = 2650981327,
+        NVPW_DIM_UNIT_FBP_CYCLES = 1785238957,
+        NVPW_DIM_UNIT_FE_OPS = 2919159083,
+        NVPW_DIM_UNIT_GPC_CYCLES = 1222631184,
+        NVPW_DIM_UNIT_IDC_REQUESTS = 2012649669,
+        NVPW_DIM_UNIT_INSTRUCTIONS = 1418625543,
+        NVPW_DIM_UNIT_KILOBYTES = 1335980302,
+        NVPW_DIM_UNIT_L1DATA_BANK_ACCESSES = 1479493682,
+        NVPW_DIM_UNIT_L1DATA_BANK_CONFLICTS = 3433170787,
+        NVPW_DIM_UNIT_L1TEX_REQUESTS = 1306473767,
+        NVPW_DIM_UNIT_L1TEX_TAGS = 26573010,
+        NVPW_DIM_UNIT_L1TEX_WAVEFRONTS = 129373765,
+        NVPW_DIM_UNIT_L2_REQUESTS = 1143695106,
+        NVPW_DIM_UNIT_L2_SECTORS = 3424101564,
+        NVPW_DIM_UNIT_L2_TAGS = 3755612781,
+        NVPW_DIM_UNIT_LRC_REQUESTS = 2280914327,
+        NVPW_DIM_UNIT_LRC_SECTORS = 7212034,
+        NVPW_DIM_UNIT_MCC_CYCLES = 1826685787,
+        NVPW_DIM_UNIT_NANOSECONDS = 3047500672,
+        NVPW_DIM_UNIT_NVDLA_CYCLES = 3374059789,
+        NVPW_DIM_UNIT_NVENC_CYCLES = 2267185244,
+        NVPW_DIM_UNIT_NVLRX_CYCLES = 4059934930,
+        NVPW_DIM_UNIT_NVLTX_CYCLES = 1814350488,
+        NVPW_DIM_UNIT_OFA_CYCLES = 4290210307,
+        NVPW_DIM_UNIT_PCIE_CYCLES = 1230450943,
+        NVPW_DIM_UNIT_PERCENT = 1284354694,
+        NVPW_DIM_UNIT_PIXELS = 4227616663,
+        NVPW_DIM_UNIT_PIXEL_SHADER_BARRIERS = 3705502518,
+        NVPW_DIM_UNIT_PRIMITIVES = 2373084002,
+        NVPW_DIM_UNIT_PVAVPU_CYCLES = 2238259366,
+        NVPW_DIM_UNIT_PVA_CYCLES = 202044173,
+        NVPW_DIM_UNIT_QUADS = 1539753497,
+        NVPW_DIM_UNIT_REGISTERS = 2837260947,
+        NVPW_DIM_UNIT_SAMPLES = 746046551,
+        NVPW_DIM_UNIT_SECONDS = 1164825258,
+        NVPW_DIM_UNIT_SYSL2_REQUESTS = 2165109286,
+        NVPW_DIM_UNIT_SYSL2_SECTORS = 2268734175,
+        NVPW_DIM_UNIT_SYSL2_TAGS = 3308651352,
+        NVPW_DIM_UNIT_SYSLRC_REQUESTS = 3328245480,
+        NVPW_DIM_UNIT_SYSLRC_SECTORS = 1190477493,
+        NVPW_DIM_UNIT_SYS_CYCLES = 3310821688,
+        NVPW_DIM_UNIT_TEXELS = 1293214069,
+        NVPW_DIM_UNIT_THREADS = 164261907,
+        NVPW_DIM_UNIT_TMEM_ACCESSES = 3742902067,
+        NVPW_DIM_UNIT_VERTICES = 1873662209,
+        NVPW_DIM_UNIT_VIC_CYCLES = 103143588,
+        NVPW_DIM_UNIT_WARPS = 97951949,
+        NVPW_DIM_UNIT_WORKIDS = 1971113483,
+        NVPW_DIM_UNIT_WORKLOADS = 1728142656
+    } NVPW_DimUnitName;
+#endif //NVPW_DIM_UNIT_DEFINED
+
+#ifndef NVPW_HW_UNIT_DEFINED
+#define NVPW_HW_UNIT_DEFINED
+    typedef enum NVPW_HwUnit // hardware unit identifiers; carried in 'hwUnit' fields of the property/ToString structs
+    { // NOTE: many values exceed 2^31-1 (e.g. 3498035701), so they do not fit a 32-bit signed int
+        NVPW_HW_UNIT_INVALID = 3498035701,
+        NVPW_HW_UNIT_CROP = 2872137846,
+        NVPW_HW_UNIT_CTC = 4123164475,
+        NVPW_HW_UNIT_DRAM = 1662616918,
+        NVPW_HW_UNIT_DRAMC = 1401232876,
+        NVPW_HW_UNIT_FBP = 2947194306,
+        NVPW_HW_UNIT_FBPA = 690045803,
+        NVPW_HW_UNIT_FE = 2204924321,
+        NVPW_HW_UNIT_GPC = 1911735839,
+        NVPW_HW_UNIT_GPU = 1014363534,
+        NVPW_HW_UNIT_GR = 2933618517,
+        NVPW_HW_UNIT_IDC = 842765289,
+        NVPW_HW_UNIT_L1TEX = 893940957,
+        NVPW_HW_UNIT_LRC = 4004756136,
+        NVPW_HW_UNIT_LTS = 2333266697,
+        NVPW_HW_UNIT_MCC = 3980130194,
+        NVPW_HW_UNIT_NVDLA = 4201167892,
+        NVPW_HW_UNIT_NVENC = 207708260,
+        NVPW_HW_UNIT_NVLRX = 3091684901,
+        NVPW_HW_UNIT_NVLTX = 869679659,
+        NVPW_HW_UNIT_OFA = 70307371,
+        NVPW_HW_UNIT_PCIE = 3433264174,
+        NVPW_HW_UNIT_PDA = 345193251,
+        NVPW_HW_UNIT_PES = 804128425,
+        NVPW_HW_UNIT_PROP = 3339255507,
+        NVPW_HW_UNIT_PVA = 2565499490,
+        NVPW_HW_UNIT_PVAVPU = 1656645655,
+        NVPW_HW_UNIT_RASTER = 187932504,
+        NVPW_HW_UNIT_SM = 724224710,
+        NVPW_HW_UNIT_SMSP = 2837616917,
+        NVPW_HW_UNIT_SYS = 768990063,
+        NVPW_HW_UNIT_SYSLRC = 3247626950,
+        NVPW_HW_UNIT_SYSLTS = 4137740217,
+        NVPW_HW_UNIT_TPC = 1889024613,
+        NVPW_HW_UNIT_VAF = 753670509,
+        NVPW_HW_UNIT_VIC = 322439594,
+        NVPW_HW_UNIT_VPC = 275561583,
+        NVPW_HW_UNIT_ZCULL = 2401248356,
+        NVPW_HW_UNIT_ZROP = 979500456
+    } NVPW_HwUnit;
+#endif //NVPW_HW_UNIT_DEFINED
+
+    typedef enum NVPW_RollupOp // stored in 'NVPW_MetricEvalRequest::rollupOp' as uint8_t
+    {
+        NVPW_ROLLUP_OP_AVG = 0,
+        NVPW_ROLLUP_OP_MAX,
+        NVPW_ROLLUP_OP_MIN,
+        NVPW_ROLLUP_OP_SUM,
+        NVPW_ROLLUP_OP__COUNT ///< evaluates to 4, the number of ops listed before it
+    } NVPW_RollupOp;
+
+    typedef enum NVPW_MetricType // stored in the 'metricType' fields of this header as uint8_t
+    {
+        NVPW_METRIC_TYPE_COUNTER = 0,
+        NVPW_METRIC_TYPE_RATIO,
+        NVPW_METRIC_TYPE_THROUGHPUT,
+        NVPW_METRIC_TYPE__COUNT ///< evaluates to 3, the number of metric types listed before it
+    } NVPW_MetricType;
+
+    typedef enum NVPW_Submetric // stored in 'NVPW_MetricEvalRequest::submetric' as uint16_t
+    {
+        NVPW_SUBMETRIC_NONE = 0,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED = 1,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ACTIVE = 2,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ACTIVE_PER_SECOND = 3,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ELAPSED = 4,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ELAPSED_PER_SECOND = 5,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_FRAME = 6,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_FRAME_PER_SECOND = 7,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_REGION = 8,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_REGION_PER_SECOND = 9,
+        NVPW_SUBMETRIC_PER_CYCLE_ACTIVE = 10,
+        NVPW_SUBMETRIC_PER_CYCLE_ELAPSED = 11,
+        NVPW_SUBMETRIC_PER_CYCLE_IN_FRAME = 12,
+        NVPW_SUBMETRIC_PER_CYCLE_IN_REGION = 13,
+        NVPW_SUBMETRIC_PER_SECOND = 14,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_ACTIVE = 15,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_ELAPSED = 16,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_FRAME = 17,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_REGION = 18,
+        NVPW_SUBMETRIC_MAX_RATE = 19,
+        NVPW_SUBMETRIC_PCT = 20,
+        NVPW_SUBMETRIC_RATIO = 21,
+        NVPW_SUBMETRIC__COUNT ///< evaluates to 22, one past the last explicit submetric value
+    } NVPW_Submetric;
+
+    typedef struct NVPW_MetricEvalRequest // one metric to evaluate: index + type + rollup + submetric
+    {
+        /// the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames' (presumably within the list for 'metricType' - confirm)
+        size_t metricIndex;
+        /// one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// one of 'NVPW_RollupOp', required for Counter and Throughput, doesn't apply to Ratio
+        uint8_t rollupOp;
+        /// one of 'NVPW_Submetric', required for Ratio and Throughput, optional for Counter
+        uint16_t submetric;
+    } NVPW_MetricEvalRequest;
+#define NVPW_MetricEvalRequest_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricEvalRequest, submetric)
+
+    typedef struct NVPW_DimUnitFactor // element type of 'pDimUnits' in NVPW_MetricsEvaluator_GetMetricDimUnits_Params
+    {
+        /// one of 'NVPW_DimUnitName'
+        uint32_t dimUnit;
+        int8_t exponent; ///< NOTE(review): undocumented; presumably the signed power applied to 'dimUnit' - confirm
+    } NVPW_DimUnitFactor;
+#define NVPW_DimUnitFactor_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_DimUnitFactor, exponent)
+
+    typedef struct NVPW_MetricsEvaluator_Destroy_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to destroy
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+    } NVPW_MetricsEvaluator_Destroy_Params;
+#define NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_Destroy_Params, pMetricsEvaluator)
+
+    NVPA_Status NVPW_MetricsEvaluator_Destroy(NVPW_MetricsEvaluator_Destroy_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricNames_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out] NOTE(review): presumably one buffer holding all names, indexed via 'pMetricNameBeginIndices' - confirm
+        const char* pMetricNames;
+        /// [out] NOTE(review): presumably the start offset of each name within 'pMetricNames' - confirm
+        const size_t* pMetricNameBeginIndices;
+        /// [out] presumably the number of metrics of type 'metricType' - confirm
+        size_t numMetrics;
+    } NVPW_MetricsEvaluator_GetMetricNames_Params;
+#define NVPW_MetricsEvaluator_GetMetricNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricNames_Params, numMetrics)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricNames(NVPW_MetricsEvaluator_GetMetricNames_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] can be either a base metric or a metric
+        const char* pMetricName;
+        /// [out] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t metricIndex;
+    } NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params;
+#define NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params, metricIndex)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricTypeAndIndex(NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric name to convert
+        const char* pMetricName;
+        /// [inout] 'pMetricEvalRequest' is in, '*pMetricEvalRequest' is out
+        struct NVPW_MetricEvalRequest* pMetricEvalRequest;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+    } NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params;
+#define NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params, metricEvalRequestStructSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_HwUnitToString_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_HwUnit'
+        uint32_t hwUnit;
+        /// [out] string name for 'hwUnit' (string lifetime/ownership is not stated here)
+        const char* pHwUnitName;
+    } NVPW_MetricsEvaluator_HwUnitToString_Params;
+#define NVPW_MetricsEvaluator_HwUnitToString_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_HwUnitToString_Params, pHwUnitName)
+
+    NVPA_Status NVPW_MetricsEvaluator_HwUnitToString(NVPW_MetricsEvaluator_HwUnitToString_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetCounterProperties_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t counterIndex;
+        /// [out] description string of the counter (string lifetime/ownership is not stated here)
+        const char* pDescription;
+        /// [out] one of 'NVPW_HwUnit'
+        uint32_t hwUnit;
+    } NVPW_MetricsEvaluator_GetCounterProperties_Params;
+#define NVPW_MetricsEvaluator_GetCounterProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetCounterProperties_Params, hwUnit)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetCounterProperties(NVPW_MetricsEvaluator_GetCounterProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetRatioMetricProperties_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t ratioMetricIndex;
+        /// [out] description string of the ratio metric (string lifetime/ownership is not stated here)
+        const char* pDescription;
+        /// [out] NOTE(review): presumably one of 'NVPW_HwUnit'; uint64_t here, uint32_t in the Counter/Throughput structs - confirm
+        uint64_t hwUnit;
+    } NVPW_MetricsEvaluator_GetRatioMetricProperties_Params;
+#define NVPW_MetricsEvaluator_GetRatioMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetRatioMetricProperties_Params, hwUnit)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetRatioMetricProperties(NVPW_MetricsEvaluator_GetRatioMetricProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t throughputMetricIndex;
+        /// [out] description string of the throughput metric (string lifetime/ownership is not stated here)
+        const char* pDescription;
+        /// [out] presumably one of 'NVPW_HwUnit', as in the counter properties struct - confirm
+        uint32_t hwUnit;
+        /// [out] number of constituent counters for the throughput metric
+        size_t numCounters;
+        /// [out] metric indices as in 'NVPW_MetricsEvaluator_GetMetricNames', valid if 'numCounters' > 0, otherwise
+        /// returned as nullptr
+        const size_t* pCounterIndices;
+        /// [out] number of constituent sub-throughputs for the throughput metric
+        size_t numSubThroughputs;
+        /// [out] metric indices as in 'NVPW_MetricsEvaluator_GetMetricNames', valid if 'numSubThroughputs' > 0,
+        /// otherwise returned as nullptr
+        const size_t* pSubThroughputIndices;
+    } NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params;
+#define NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params, pSubThroughputIndices)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetThroughputMetricProperties(NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out] an array of 'NVPW_Submetric'
+        const uint16_t* pSupportedSubmetrics;
+        /// [out] presumably the element count of 'pSupportedSubmetrics' - confirm
+        size_t numSupportedSubmetrics;
+    } NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params;
+#define NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params, numSupportedSubmetrics)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetSupportedSubmetrics(NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricRawDependencies_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] presumably an array of 'numMetricEvalRequests' requests spaced 'metricEvalRequestStrideSize' apart - confirm
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequests;
+        /// [in] presumably the number of requests at 'pMetricEvalRequests' - confirm
+        size_t numMetricEvalRequests;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [in] set to sizeof('NVPW_MetricEvalRequest')
+        size_t metricEvalRequestStrideSize;
+        /// [inout] 'ppRawDependencies' is in, '*ppRawDependencies' is out
+        const char** ppRawDependencies;
+        /// [inout] if 'ppRawDependencies' is NULL, number of raw dependencies available will be returned; otherwise it
+        /// should be set to the number of elements allocated for 'ppRawDependencies', and on return, it will be
+        /// overwritten by number of elements copied to 'ppRawDependencies'
+        size_t numRawDependencies;
+        /// [inout] 'ppOptionalRawDependencies' is in, '*ppOptionalRawDependencies' is out
+        const char** ppOptionalRawDependencies;
+        /// [inout] if 'ppOptionalRawDependencies' is NULL, number of optional raw dependencies available will be
+        /// returned; otherwise it should be set to the number of elements allocated for 'ppOptionalRawDependencies',
+        /// and on return, it will be overwritten by number of elements copied to 'ppOptionalRawDependencies'
+        size_t numOptionalRawDependencies;
+    } NVPW_MetricsEvaluator_GetMetricRawDependencies_Params;
+#define NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricRawDependencies_Params, numOptionalRawDependencies)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricRawDependencies(NVPW_MetricsEvaluator_GetMetricRawDependencies_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_DimUnitToString_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_DimUnitName'
+        uint32_t dimUnit;
+        /// [out] singular-form name string for 'dimUnit' (string lifetime/ownership is not stated here)
+        const char* pSingularName;
+        /// [out] plural-form name string for 'dimUnit' (string lifetime/ownership is not stated here)
+        const char* pPluralName;
+    } NVPW_MetricsEvaluator_DimUnitToString_Params;
+#define NVPW_MetricsEvaluator_DimUnitToString_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_DimUnitToString_Params, pPluralName)
+
+    NVPA_Status NVPW_MetricsEvaluator_DimUnitToString(NVPW_MetricsEvaluator_DimUnitToString_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricDimUnits_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to query
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the single request whose dim-units are queried
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequest;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [inout] 'pDimUnits' is in, '*pDimUnits' is out
+        NVPW_DimUnitFactor* pDimUnits;
+        /// [inout] if 'pDimUnits' is NULL, number of dim-units available will be returned; otherwise it should be set
+        /// to the number of elements allocated for 'pDimUnits', and on return, it will be overwritten by number of
+        /// elements copied to 'pDimUnits'
+        size_t numDimUnits;
+        /// [in] set to 'NVPW_DimUnitFactor_STRUCT_SIZE'
+        size_t dimUnitFactorStructSize;
+    } NVPW_MetricsEvaluator_GetMetricDimUnits_Params;
+#define NVPW_MetricsEvaluator_GetMetricDimUnits_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricDimUnits_Params, dimUnitFactorStructSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricDimUnits(NVPW_MetricsEvaluator_GetMetricDimUnits_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_SetUserData_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to configure
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] duration in ns of user defined frame
+        double frameDuration;
+        /// [in] duration in ns of user defined region
+        double regionDuration;
+        /// [in] NOTE(review): meaning of 'isolated' is not documented in this header section - confirm
+        NVPA_Bool isolated;
+    } NVPW_MetricsEvaluator_SetUserData_Params;
+#define NVPW_MetricsEvaluator_SetUserData_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_SetUserData_Params, isolated)
+
+    NVPA_Status NVPW_MetricsEvaluator_SetUserData(NVPW_MetricsEvaluator_SetUserData_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_EvaluateToGpuValues_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to use
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] presumably an array of 'numMetricEvalRequests' requests spaced 'metricEvalRequestStrideSize' apart - confirm
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequests;
+        /// [in] presumably the number of requests at 'pMetricEvalRequests' - confirm
+        size_t numMetricEvalRequests;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [in] set to sizeof('NVPW_MetricEvalRequest')
+        size_t metricEvalRequestStrideSize;
+        /// [in] counter data image to evaluate against
+        const uint8_t* pCounterDataImage;
+        /// [in] presumably the size in bytes of 'pCounterDataImage' - confirm
+        size_t counterDataImageSize;
+        /// [in] NOTE(review): presumably selects a range within the counter data image - confirm
+        size_t rangeIndex;
+        /// [in] NOTE(review): meaning of 'isolated' is not documented in this header section - confirm
+        NVPA_Bool isolated;
+        /// [inout] 'pMetricValues' is in, '*pMetricValues' is out
+        double* pMetricValues;
+    } NVPW_MetricsEvaluator_EvaluateToGpuValues_Params;
+#define NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_EvaluateToGpuValues_Params, pMetricValues)
+
+    NVPA_Status NVPW_MetricsEvaluator_EvaluateToGpuValues(NVPW_MetricsEvaluator_EvaluateToGpuValues_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_SetDeviceAttributes_Params
+    {
+        /// [in] size of this struct; presumably the *_STRUCT_SIZE macro defined below - confirm
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] the evaluator to configure
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] NOTE(review): presumably the counter data image the device attributes are taken from - confirm
+        const uint8_t* pCounterDataImage;
+        /// [in] presumably the size in bytes of 'pCounterDataImage' - confirm
+        size_t counterDataImageSize;
+    } NVPW_MetricsEvaluator_SetDeviceAttributes_Params;
+#define NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_SetDeviceAttributes_Params, counterDataImageSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_SetDeviceAttributes(NVPW_MetricsEvaluator_SetDeviceAttributes_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+
+#endif // NVPERF_HOST_API_DEFINED
+
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_HOST_H
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7fa9d6f2f96f48ec46d2da816256be238cf70343
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_H__)
+#define __SM_20_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ / _NVHPC_CUDA / default */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST /* __DEF_IF_HOST supplies '{ }' or ';' per the #if above */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_20_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d13b04ae4fb58970eb17ffefc08ae112e3cd24f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_atomic_functions.hpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_20_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
+extern "C"  /* C linkage: the device builtin that atomicAdd(float *, float) forwards to */
+{
+extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
+}
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+#if defined(__CUDACC_RTC__)  /* chooses the qualifiers prefixed to atomicAdd below */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__: additionally static + inline */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*  Definition of atomicAdd(float *, float): forwards address and val           *
+*  unchanged to the __fAtomicAdd builtin and returns its result.               *
+*                                                                              *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
+{
+  return __fAtomicAdd(address, val);
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..53cac60018360d7ab814092118ba9db4c03fcba9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_20_intrinsics.hpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_INTRINSICS_HPP__)
+#define __SM_20_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)  /* qualifier sets used by every definition in this header */
+#define __SM_20_INTRINSICS_DECL__ __device__
+#define __COMMON_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__: static + inline; the COMMON variant also carries __host__ */
+#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
+#define __COMMON_INTRINSICS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*  bool-typed convenience wrappers: each converts pred to int, calls the       *
+*  matching double-underscore intrinsic, and (for and/or) converts the         *
+*  result back to bool.                                                        *
+*******************************************************************************/
+
+__SM_20_INTRINSICS_DECL__ unsigned int ballot(bool pred)
+{
+  return __ballot((int)pred);  /* NOTE: forwards to the mask-less (non-_sync) __ballot */
+}
+
+__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred)
+{
+  return __syncthreads_count((int)pred);  /* int result is returned as-is */
+}
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred)
+{
+  return (bool)__syncthreads_and((int)pred);  /* int result narrowed to bool */
+}
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred)
+{
+  return (bool)__syncthreads_or((int)pred);  /* int result narrowed to bool */
+}
+
+
+extern "C" {  /* C-linkage hooks that the __is*() wrappers below forward to */
+  __device__ unsigned __nv_isGlobal_impl(const void *);
+  __device__ unsigned __nv_isShared_impl(const void *);
+  __device__ unsigned __nv_isConstant_impl(const void *);
+  __device__ unsigned __nv_isLocal_impl(const void *);
+  __device__ unsigned __nv_isGridConstant_impl(const void *);
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr)
+{
+  return __nv_isGlobal_impl(ptr); /* presumably nonzero when ptr is in the named space -- TODO confirm */
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr)
+{
+  return __nv_isShared_impl(ptr); 
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr)
+{
+  return __nv_isConstant_impl(ptr); 
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr)
+{
+  return __nv_isLocal_impl(ptr); 
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)  /* wrapper omitted for device passes below 700 */
+__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr)
+{
+  return __nv_isGridConstant_impl(ptr); 
+}
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+extern "C" {  /* C-linkage hooks behind the __cvta_*() wrappers below */
+  __device__ size_t __nv_cvta_generic_to_global_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_shared_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_constant_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_local_impl(const void *);
+  __device__ void * __nv_cvta_global_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_shared_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_constant_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_local_to_generic_impl(size_t);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *p)  /* pointer in, size_t out */
+{
+  return __nv_cvta_generic_to_global_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *p)
+{
+  return __nv_cvta_generic_to_shared_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *p)
+{
+  return __nv_cvta_generic_to_constant_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *p)
+{
+  return __nv_cvta_generic_to_local_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits)  /* size_t in, pointer out */
+{
+  return __nv_cvta_global_to_generic_impl(rawbits);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits)
+{
+  return __nv_cvta_shared_to_generic_impl(rawbits);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits)
+{
+  return __nv_cvta_constant_to_generic_impl(rawbits);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits)
+{
+  return __nv_cvta_local_to_generic_impl(rawbits);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __CVTA_PTR_64 1  /* selects the .u64 asm forms below */
+#endif
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr)
+{
+#if __CVTA_PTR_64  /* when left undefined this evaluates to 0 and the .u32 forms are used */
+  unsigned long long ret;
+  asm("cvta.to.param.u64 %0, %1;"  : "=l"(ret) : "l"(ptr));
+#else  /* !__CVTA_PTR_64 */
+  unsigned ret;
+  asm("cvta.to.param.u32 %0, %1;"  : "=r"(ret) : "r"(ptr));
+#endif  /* __CVTA_PTR_64 */  
+  return (size_t)ret;
+  
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits)
+{
+  void *ret;
+#if __CVTA_PTR_64  
+  unsigned long long in = rawbits;  /* local whose type matches the "l" asm operand */
+  asm("cvta.param.u64 %0, %1;" : "=l"(ret) : "l"(in));
+#else  /* !__CVTA_PTR_64 */
+  unsigned in = rawbits;  /* local whose type matches the "r" asm operand */
+  asm("cvta.param.u32 %0, %1;" : "=r"(ret) : "r"(in));
+#endif  /* __CVTA_PTR_64 */
+  return ret;
+}
+#undef __CVTA_PTR_64
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+extern "C" {  /* hooks are __device__ only; NOTE(review): wrappers below are also __host__ outside RTC -- confirm */
+  __device__ unsigned short __nv_bswap16_impl(unsigned short);
+  __device__ unsigned int __nv_bswap32_impl(unsigned int);
+  __device__ unsigned long long __nv_bswap64_impl(unsigned long long);
+}
+
+__COMMON_INTRINSICS_DECL__ unsigned short __nv_bswap16(unsigned short in) {
+  return __nv_bswap16_impl(in);  /* forwards to the 16-bit hook */
+}
+
+__COMMON_INTRINSICS_DECL__ unsigned int __nv_bswap32(unsigned int in) {
+  return __nv_bswap32_impl(in);  /* forwards to the 32-bit hook */
+}
+
+__COMMON_INTRINSICS_DECL__ unsigned long long __nv_bswap64(unsigned long long in) {
+  return __nv_bswap64_impl(in);  /* forwards to the 64-bit hook */
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_20_INTRINSICS_DECL__
+#undef __COMMON_INTRINSICS_DECL__
+
+#endif /* !__SM_20_INTRINSICS_HPP__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad3bfc500fefaaefdc801e2fe42c27def1ddaa58
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_30_INTRINSICS_H__)
+#define __SM_30_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)  /* chooses the qualifiers prefixed to every prototype below */
+#define __SM_30_INTRINSICS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_30_INTRINSICS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ && !_NVHPC_CUDA */
+#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/* __DEF_IF_HOST terminates every prototype below. The !defined(_NVHPC_CUDA) test keeps
+ * that compiler, where __CUDA_ARCH__ is not defined, from getting empty function bodies. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* __CUDA_ARCH__ || _NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-3.0 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined warpSize && !defined __local_warpSize  /* stand-in for the width= defaults below */
+#define warpSize    32
+#define __local_warpSize  /* marker: this header defined warpSize and undefines it again at the end */
+#endif
+
+#if defined(_WIN32)  /* __DEPRECATED__(msg): pick the deprecation syntax the compiler accepts */
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))  /* msg is dropped in this branch */
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(_NVHPC_CUDA)  /* message text attached to the mask-less shuffle overloads */
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on cc70 and above, and should be replaced with "#x"_sync()."
+#elif !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif  /* left undefined for __CUDA_ARCH__ >= 700, where the overloads using it are not declared */
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Find the position of the n-th bit set to 1 in a 32-bit integer.
+ *
+ * Given a 32-bit value \p mask and an integer value \p base (between 0 and 31),
+ * find the n-th (given by \p offset) set bit in \p mask, counting from the
+ * \p base bit. If no such bit is found, 0xFFFFFFFF is returned.
+ *
+ * See also https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#integer-arithmetic-instructions-fns
+ * for more information.
+ *
+ * \return Returns the position of the n-th set bit (a 32-bit value has bit
+ * positions 0 to 31), or 0xFFFFFFFF when no such bit is found.
+ * - parameter \p base must be <=31, otherwise behavior is undefined.
+ */
+__SM_30_INTRINSICS_DECL__ unsigned  __fns(unsigned mask, unsigned base, int offset) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __barrier_sync(unsigned id) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __barrier_sync_count(unsigned id, unsigned cnt) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __syncwarp(unsigned mask=0xFFFFFFFF) __DEF_IF_HOST  /* warp-level barrier; default mask has all 32 bits set */
+__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) __DEF_IF_HOST  /* vote intrinsics take an explicit participant mask */
+__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned __activemask() __DEF_IF_HOST  /* takes no mask argument */
+
+// Warp register-exchange (shuffle) intrinsics: int, unsigned int and float overloads.
+// Notes:
+// a) width defaults to warpSize, which this header temporarily #defines as 32
+//    because the compiler does not know the "warpSize" constant at this point.
+// b) The float overloads are declared separately rather than mapped onto the int ones,
+//    because that would disturb register numbering (e.g. two shfls used to move a double).
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700  /* deprecated mask-less forms */
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) int __shfl(int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned int __shfl(unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) int __shfl_up(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned int __shfl_up(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) int __shfl_down(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned int __shfl_down(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) int __shfl_xor(int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned int __shfl_xor(unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) float __shfl(float var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) float __shfl_up(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) float __shfl_down(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) float __shfl_xor(float var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif  /* deprecated mask-less 32-bit shuffles */
+
+__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+// 64-bit shuffles: long long, unsigned long long and double overloads of the functions above.
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700  /* deprecated mask-less forms */
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long long __shfl(unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long long __shfl(long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long long __shfl_up(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long long __shfl_down(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long long __shfl_xor(long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) double __shfl(double var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) double __shfl_up(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) double __shfl_down(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) double __shfl_xor(double var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif  /* deprecated mask-less 64-bit shuffles */
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+// long and unsigned long get their own overloads, since they may be either 32 or 64 bits wide.
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700  /* deprecated mask-less forms */
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long __shfl(long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long __shfl(unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long __shfl_up(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long __shfl_up(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long __shfl_down(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long __shfl_down(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long __shfl_xor(long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long __shfl_xor(unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif  /* deprecated mask-less long shuffles */
+
+__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+#undef __DEPRECATED__  /* helper macros are local to this header */
+#undef __WSB_DEPRECATION_MESSAGE
+
+#if defined(__local_warpSize)  /* drop warpSize only if this header was the one that defined it */
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_30_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_30_intrinsics.hpp"  /* pulled in only outside RTC when __CUDA_ARCH__ is defined */
+#endif /* !__CUDACC_RTC__ && __CUDA_ARCH__ */
+
+#endif /* !__SM_30_INTRINSICS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5bcac5ee68c0cf547e4de7c08badf37106639dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_30_intrinsics.hpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_30_INTRINSICS_HPP__)
+#define __SM_30_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_30_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+// In here are intrinsics which are built in to the compiler. These may be
+// referenced by intrinsic implementations from this file.
+extern "C"
+{
+}
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.0 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// Fall back to a literal warp size of 32 when no warpSize macro is visible.
+// __local_warpSize records that this header made the definition, so that
+// both macros can be #undef'd again at the end of this file.
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+// Thin wrapper that forwards (mask, base, offset) to the compiler builtin
+// __nvvm_fns and returns its result unchanged.
+__SM_30_INTRINSICS_DECL__
+unsigned __fns(unsigned mask, unsigned base, int offset) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset);
+  const unsigned found = __nvvm_fns(mask, base, offset);
+  return found;
+}
+
+// Forwards the barrier id to the compiler builtin __nvvm_barrier_sync.
+__SM_30_INTRINSICS_DECL__
+void __barrier_sync(unsigned id) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id);
+  __nvvm_barrier_sync(id);
+}
+
+// Forwards the barrier id and count to the compiler builtin
+// __nvvm_barrier_sync_cnt.
+__SM_30_INTRINSICS_DECL__
+void __barrier_sync_count(unsigned id, unsigned cnt) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt);
+  __nvvm_barrier_sync_cnt(id, cnt);
+}
+
+// Forwards the lane mask to the compiler builtin __nvvm_bar_warp_sync.
+__SM_30_INTRINSICS_DECL__
+void __syncwarp(unsigned mask) {
+  extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask);
+  __nvvm_bar_warp_sync(mask);
+}
+
+// Warp vote wrapper: returns the result of the __nvvm_vote_all_sync builtin
+// for the given mask and predicate.
+__SM_30_INTRINSICS_DECL__
+int __all_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred);
+  const int vote = __nvvm_vote_all_sync(mask, pred);
+  return vote;
+}
+
+// Warp vote wrapper: returns the result of the __nvvm_vote_any_sync builtin
+// for the given mask and predicate.
+__SM_30_INTRINSICS_DECL__
+int __any_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred);
+  const int vote = __nvvm_vote_any_sync(mask, pred);
+  return vote;
+}
+
+// Warp vote wrapper: returns the result of the __nvvm_vote_uni_sync builtin
+// for the given mask and predicate.
+__SM_30_INTRINSICS_DECL__
+int __uni_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred);
+  const int vote = __nvvm_vote_uni_sync(mask, pred);
+  return vote;
+}
+
+// Warp vote wrapper: returns the unsigned result of the
+// __nvvm_vote_ballot_sync builtin for the given mask and predicate.
+__SM_30_INTRINSICS_DECL__
+unsigned __ballot_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred);
+  const unsigned ballot = __nvvm_vote_ballot_sync(mask, pred);
+  return ballot;
+}
+
+// Reads the 32-bit result of the PTX activemask.b32 instruction and returns it.
+__SM_30_INTRINSICS_DECL__
+unsigned __activemask() {
+    unsigned activeLanes;
+    asm volatile ("activemask.b32 %0;" : "=r"(activeLanes));
+    return activeLanes;
+}
+
+// These are removed starting with compute_70 and onwards
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+
+// Legacy (non-_sync) 32-bit indexed shuffle, emitted as PTX shfl.idx.b32.
+__SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) {
+    // Control operand: (warpSize - width) shifted left by 8, OR'ed with 0x1f.
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    int shuffled;
+    asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(shuffled) : "r"(var), "r"(srcLane), "r"(control));
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl(asSigned, srcLane, width);
+}
+
+// Legacy (non-_sync) 32-bit shuffle-up, emitted as PTX shfl.up.b32.
+__SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) {
+    // Control operand: (warpSize - width) shifted left by 8; low bits stay 0.
+    const int control = (warpSize - width) << 8;
+    int shuffled;
+    asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(shuffled) : "r"(var), "r"(delta), "r"(control));
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl_up, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl_up(asSigned, delta, width);
+}
+
+// Legacy (non-_sync) 32-bit shuffle-down, emitted as PTX shfl.down.b32.
+__SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) {
+    // Control operand: (warpSize - width) shifted left by 8, OR'ed with 0x1f.
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    int shuffled;
+    asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(shuffled) : "r"(var), "r"(delta), "r"(control));
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl_down, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl_down(asSigned, delta, width);
+}
+
+// Legacy (non-_sync) 32-bit butterfly shuffle, emitted as PTX shfl.bfly.b32.
+__SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) {
+    // Control operand: (warpSize - width) shifted left by 8, OR'ed with 0x1f.
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    int shuffled;
+    asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(shuffled) : "r"(var), "r"(laneMask), "r"(control));
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl_xor, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl_xor(asSigned, laneMask, width);
+}
+
+// Legacy float indexed shuffle: shfl.idx.b32 applied directly to float
+// ("f"-constrained) operands rather than going through the int overload.
+__SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) {
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    float shuffled;
+    asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(srcLane), "r"(control));
+    return shuffled;
+}
+
+// Legacy float shuffle-up: shfl.up.b32 applied directly to float operands.
+__SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) {
+    const int control = (warpSize - width) << 8;
+    float shuffled;
+    asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(delta), "r"(control));
+    return shuffled;
+}
+
+// Legacy float shuffle-down: shfl.down.b32 applied directly to float operands.
+__SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) {
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    float shuffled;
+    asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(delta), "r"(control));
+    return shuffled;
+}
+
+// Legacy float butterfly shuffle: shfl.bfly.b32 applied directly to float operands.
+__SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) {
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    float shuffled;
+    asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(shuffled) : "f"(var), "r"(laneMask), "r"(control));
+    return shuffled;
+}
+
+// 64-bits SHFL
+
+// Legacy 64-bit indexed shuffle built from two 32-bit shuffles: unpack var
+// into two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl(highWord, srcLane, width);
+    lowWord = __shfl(lowWord, srcLane, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl(asSigned, srcLane, width);
+}
+
+// Legacy 64-bit shuffle-up built from two 32-bit shuffles: unpack var into
+// two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl_up(highWord, delta, width);
+    lowWord = __shfl_up(lowWord, delta, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl_up.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl_up(asSigned, delta, width);
+}
+
+// Legacy 64-bit shuffle-down built from two 32-bit shuffles: unpack var into
+// two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl_down(highWord, delta, width);
+    lowWord = __shfl_down(lowWord, delta, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl_down.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl_down(asSigned, delta, width);
+}
+
+// Legacy 64-bit butterfly shuffle built from two 32-bit shuffles: unpack var
+// into two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl_xor(highWord, laneMask, width);
+    lowWord = __shfl_xor(lowWord, laneMask, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl_xor.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl_xor(asSigned, laneMask, width);
+}
+
+// Legacy double indexed shuffle: unpack the 64-bit value into two unsigned
+// 32-bit words, shuffle the high word then the low word via the unsigned
+// int overload, and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl(highWord, srcLane, width);
+    lowWord = __shfl(lowWord, srcLane, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// Legacy double shuffle-up: unpack into two unsigned 32-bit words, shuffle
+// the high word then the low word, and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl_up(highWord, delta, width);
+    lowWord = __shfl_up(lowWord, delta, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// Legacy double shuffle-down: unpack into two unsigned 32-bit words, shuffle
+// the high word then the low word, and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl_down(highWord, delta, width);
+    lowWord = __shfl_down(lowWord, delta, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// Legacy double butterfly shuffle: unpack into two unsigned 32-bit words,
+// shuffle the high word then the low word, and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl_xor(highWord, laneMask, width);
+    lowWord = __shfl_xor(lowWord, laneMask, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// long overload: picks the long long or the int __shfl depending on
+// whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl((long long) var, srcLane, width);
+    }
+    return (long) __shfl((int) var, srcLane, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl((unsigned long long) var, srcLane, width);
+    }
+    return (unsigned long) __shfl((unsigned int) var, srcLane, width);
+}
+
+// long overload: picks the long long or the int __shfl_up depending on
+// whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl_up((long long) var, delta, width);
+    }
+    return (long) __shfl_up((int) var, delta, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl_up depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl_up((unsigned long long) var, delta, width);
+    }
+    return (unsigned long) __shfl_up((unsigned int) var, delta, width);
+}
+
+// long overload: picks the long long or the int __shfl_down depending on
+// whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl_down((long long) var, delta, width);
+    }
+    return (long) __shfl_down((int) var, delta, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl_down depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl_down((unsigned long long) var, delta, width);
+    }
+    return (unsigned long) __shfl_down((unsigned int) var, delta, width);
+}
+
+// long overload: picks the long long or the int __shfl_xor depending on
+// whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl_xor((long long) var, laneMask, width);
+    }
+    return (long) __shfl_xor((int) var, laneMask, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl_xor depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl_xor((unsigned long long) var, laneMask, width);
+    }
+    return (unsigned long) __shfl_xor((unsigned int) var, laneMask, width);
+}
+
+#endif /* defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+// Warp register exchange (shuffle) intrinsics.
+// Notes:
+// a) Warp size is hardcoded to 32 here, because the compiler does not know
+//    the "warpSize" constant at this time
+// b) we cannot map the float __shfl to the int __shfl because it'll mess with
+//    the register number (especially if you're doing two shfls to move a double).
+// Synchronized 32-bit indexed shuffle; forwards to the compiler builtin
+// __nvvm_shfl_idx_sync with the packed control word.
+__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    // Control word: (warpSize - width) shifted left by 8, OR'ed with 0x1f.
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    const int shuffled = __nvvm_shfl_idx_sync(mask, var, srcLane, control);
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl_sync, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl_sync(mask, asSigned, srcLane, width);
+}
+
+// Synchronized 32-bit shuffle-up; forwards to the compiler builtin
+// __nvvm_shfl_up_sync with the packed control word.
+__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    // Control word: (warpSize - width) shifted left by 8; low bits stay 0.
+    const int control = (warpSize - width) << 8;
+    const int shuffled = __nvvm_shfl_up_sync(mask, var, delta, control);
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl_up_sync, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl_up_sync(mask, asSigned, delta, width);
+}
+
+// Synchronized 32-bit shuffle-down; forwards to the compiler builtin
+// __nvvm_shfl_down_sync with the packed control word.
+__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    // Control word: (warpSize - width) shifted left by 8, OR'ed with 0x1f.
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    const int shuffled = __nvvm_shfl_down_sync(mask, var, delta, control);
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl_down_sync, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl_down_sync(mask, asSigned, delta, width);
+}
+
+// Synchronized 32-bit butterfly shuffle; forwards to the compiler builtin
+// __nvvm_shfl_bfly_sync with the packed control word.
+__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    // Control word: (warpSize - width) shifted left by 8, OR'ed with 0x1f.
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    const int shuffled = __nvvm_shfl_bfly_sync(mask, var, laneMask, control);
+    return shuffled;
+}
+
+// unsigned int overload: casts to int, uses the int __shfl_xor_sync, casts back.
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) {
+    const int asSigned = (int) var;
+    return (unsigned int) __shfl_xor_sync(mask, asSigned, laneMask, width);
+}
+
+// Synchronized float indexed shuffle: the float's bit pattern is passed
+// through __nvvm_shfl_idx_sync as an int and converted back afterwards.
+__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    const int shuffledBits = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, control);
+    return __int_as_float(shuffledBits);
+}
+
+// Synchronized float shuffle-up: the float's bit pattern is passed through
+// __nvvm_shfl_up_sync as an int and converted back afterwards.
+__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    const int control = (warpSize - width) << 8;
+    const int shuffledBits = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, control);
+    return __int_as_float(shuffledBits);
+}
+
+// Synchronized float shuffle-down: the float's bit pattern is passed through
+// __nvvm_shfl_down_sync as an int and converted back afterwards.
+__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    const int shuffledBits = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, control);
+    return __int_as_float(shuffledBits);
+}
+
+// Synchronized float butterfly shuffle: the float's bit pattern is passed
+// through __nvvm_shfl_bfly_sync as an int and converted back afterwards.
+__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
+    extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+    const int control = ((warpSize - width) << 8) | 0x1f;
+    const int shuffledBits = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, control);
+    return __int_as_float(shuffledBits);
+}
+
+// 64-bits SHFL
+// Synchronized 64-bit indexed shuffle built from two 32-bit shuffles: unpack
+// var into two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl_sync(mask, highWord, srcLane, width);
+    lowWord = __shfl_sync(mask, lowWord, srcLane, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl_sync.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl_sync(mask, asSigned, srcLane, width);
+}
+
+// Synchronized 64-bit shuffle-up built from two 32-bit shuffles: unpack var
+// into two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl_up_sync(mask, highWord, delta, width);
+    lowWord = __shfl_up_sync(mask, lowWord, delta, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl_up_sync.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl_up_sync(mask, asSigned, delta, width);
+}
+
+// Synchronized 64-bit shuffle-down built from two 32-bit shuffles: unpack var
+// into two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl_down_sync(mask, highWord, delta, width);
+    lowWord = __shfl_down_sync(mask, lowWord, delta, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl_down_sync.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl_down_sync(mask, asSigned, delta, width);
+}
+
+// Synchronized 64-bit butterfly shuffle built from two 32-bit shuffles: unpack
+// var into two 32-bit words, shuffle the high word then the low word, repack.
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) {
+    int lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "l"(var));
+    highWord = __shfl_xor_sync(mask, highWord, laneMask, width);
+    lowWord = __shfl_xor_sync(mask, lowWord, laneMask, width);
+    long long packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// unsigned long long overload: casts through the long long __shfl_xor_sync.
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) {
+    const long long asSigned = (long long) var;
+    return (unsigned long long) __shfl_xor_sync(mask, asSigned, laneMask, width);
+}
+
+// Synchronized double indexed shuffle: unpack into two unsigned 32-bit words,
+// shuffle the high word then the low word via the unsigned int overload,
+// and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl_sync(mask, highWord, srcLane, width);
+    lowWord = __shfl_sync(mask, lowWord, srcLane, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// Synchronized double shuffle-up: unpack into two unsigned 32-bit words,
+// shuffle the high word then the low word, and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl_up_sync(mask, highWord, delta, width);
+    lowWord = __shfl_up_sync(mask, lowWord, delta, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// Synchronized double shuffle-down: unpack into two unsigned 32-bit words,
+// shuffle the high word then the low word, and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl_down_sync(mask, highWord, delta, width);
+    lowWord = __shfl_down_sync(mask, lowWord, delta, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// Synchronized double butterfly shuffle: unpack into two unsigned 32-bit
+// words, shuffle the high word then the low word, and repack into a double.
+__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) {
+    unsigned lowWord, highWord;
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lowWord), "=r"(highWord) : "d"(var));
+    highWord = __shfl_xor_sync(mask, highWord, laneMask, width);
+    lowWord = __shfl_xor_sync(mask, lowWord, laneMask, width);
+    double packed;
+    asm volatile("mov.b64 %0, {%1,%2};" : "=d"(packed) : "r"(lowWord), "r"(highWord));
+    return packed;
+}
+
+// long needs some help to choose between 32-bits and 64-bits
+
+// long overload: picks the long long or the int __shfl_sync depending on
+// whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl_sync(mask, (long long) var, srcLane, width);
+    }
+    return (long) __shfl_sync(mask, (int) var, srcLane, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl_sync depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl_sync(mask, (unsigned long long) var, srcLane, width);
+    }
+    return (unsigned long) __shfl_sync(mask, (unsigned int) var, srcLane, width);
+}
+
+// long overload: picks the long long or the int __shfl_up_sync depending on
+// whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl_up_sync(mask, (long long) var, delta, width);
+    }
+    return (long) __shfl_up_sync(mask, (int) var, delta, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl_up_sync depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl_up_sync(mask, (unsigned long long) var, delta, width);
+    }
+    return (unsigned long) __shfl_up_sync(mask, (unsigned int) var, delta, width);
+}
+
+// long overload: picks the long long or the int __shfl_down_sync depending
+// on whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl_down_sync(mask, (long long) var, delta, width);
+    }
+    return (long) __shfl_down_sync(mask, (int) var, delta, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl_down_sync depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl_down_sync(mask, (unsigned long long) var, delta, width);
+    }
+    return (unsigned long) __shfl_down_sync(mask, (unsigned int) var, delta, width);
+}
+
+// long overload: picks the long long or the int __shfl_xor_sync depending
+// on whether long is as wide as long long on this platform.
+__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (long) __shfl_xor_sync(mask, (long long) var, laneMask, width);
+    }
+    return (long) __shfl_xor_sync(mask, (int) var, laneMask, width);
+}
+
+// unsigned long overload: picks the unsigned long long or the unsigned int
+// __shfl_xor_sync depending on whether long is as wide as long long.
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) {
+    if (sizeof(long) == sizeof(long long)) {
+        return (unsigned long) __shfl_xor_sync(mask, (unsigned long long) var, laneMask, width);
+    }
+    return (unsigned long) __shfl_xor_sync(mask, (unsigned int) var, laneMask, width);
+}
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_30_INTRINSICS_DECL__
+
+#endif /* !__SM_30_INTRINSICS_HPP__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2070bc8bbfc0c5aa58c45ef1d28623d91f4e938
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_H__)
+#define __SM_32_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+// 64-bit overloads of atomicMin / atomicMax / atomicAnd / atomicOr /
+// atomicXor for long long and unsigned long long. Each takes the target
+// address and an operand value and returns a value of the same type.
+// Every declaration ends with __DEF_IF_HOST, which expands to an empty
+// body "{ }" when neither __CUDA_ARCH__ nor _NVHPC_CUDA is defined, and
+// to ";" (a plain declaration) otherwise.
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_32_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_H__ */
+
+#undef EXCLUDE_FROM_RTC
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f493c1c54c7715703d09c70eb78b70b60d208d9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.h
@@ -0,0 +1,516 @@
+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_INTRINSICS_H__)
+#define __SM_32_INTRINSICS_H__
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST ;
+#else  /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+#define __DEF_IF_HOST { }
+#endif /* defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) */
+
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-3.5 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+/******************************************************************************
+ *  __ldg: typed load of *ptr through the read-only data cache                *
+ ******************************************************************************/
+
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) __DEF_IF_HOST
+
+/******************************************************************************
+ *  __ldcg: typed load of *ptr; presumably PTX ld.cg cache hint -- confirm    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *  __ldca: typed load of *ptr; presumably PTX ld.ca cache hint -- confirm    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *  __ldcs: typed load of *ptr; presumably PTX ld.cs cache hint -- confirm    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *  __ldlu: typed load of *ptr; presumably PTX ld.lu cache hint -- confirm    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *  __ldcv: typed load of *ptr; presumably PTX ld.cv cache hint -- confirm    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *  __stwb: typed store of value to *ptr; presumably PTX st.wb -- confirm     *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) __DEF_IF_HOST
+/******************************************************************************
+ *  __stcg: typed store of value to *ptr; presumably PTX st.cg -- confirm     *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) __DEF_IF_HOST
+/******************************************************************************
+ *  __stcs: typed store of value to *ptr; presumably PTX st.cs -- confirm     *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) __DEF_IF_HOST
+/******************************************************************************
+ *  __stwt: typed store of value to *ptr; presumably PTX st.wt -- confirm     *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) __DEF_IF_HOST
+
+
+// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
+// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift left by \p shift & 31 bits, return the most significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating arguments \p lo and \p hi left by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted left by the wrapped value of \p shift (\p shift & 31).
+ * The most significant 32 bits of the result are returned.
+ *
+ * \return Returns the most significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift left by min(\p shift, 32) bits, return the most significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating arguments \p lo and \p hi left by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted left by the clamped value of \p shift (min(\p shift, 32)).
+ * The most significant 32 bits of the result are returned.
+ *
+ * \return Returns the most significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift right by \p shift & 31 bits, return the least significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating arguments \p lo and \p hi right by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted right by the wrapped value of \p shift (\p shift & 31).
+ * The least significant 32 bits of the result are returned.
+ *
+ * \return Returns the least significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift right by min(\p shift, 32) bits, return the least significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating arguments \p lo and \p hi right by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted right by the clamped value of \p shift (min(\p shift, 32)).
+ * The least significant 32 bits of the result are returned.
+ *
+ * \return Returns the least significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && (defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA))
+#include "sm_32_intrinsics.hpp"
+#endif /* !defined(__CUDACC_RTC__) && (defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA))  */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* !__SM_32_INTRINSICS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8961079aeac4c9e73a7c2825cf9ea10b171af09
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_atomic_functions.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_35_ATOMIC_FUNCTIONS_H__)
+#define __SM_35_ATOMIC_FUNCTIONS_H__
+
+/*******************************************************************************
+* All sm_35 atomics are supported by sm_32 so simply include its header file   *
+*******************************************************************************/
+#include "sm_32_atomic_functions.h"
+
+#endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_intrinsics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..a13a4fbb0133ea5ed9f2fcc317292ae3fe5397af
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_35_intrinsics.h
@@ -0,0 +1,106 @@
+/*
+
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+
+ *
+
+ * NOTICE TO LICENSEE:
+
+ *
+
+ * This source code and/or documentation ("Licensed Deliverables") are
+
+ * subject to NVIDIA intellectual property rights under U.S. and
+
+ * international Copyright laws.
+
+ *
+
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+
+ * conditions of a form of NVIDIA software license agreement by and
+
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+
+ * the contrary in the License Agreement, reproduction or disclosure
+
+ * of the Licensed Deliverables to any third party without the express
+
+ * written consent of NVIDIA is prohibited.
+
+ *
+
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+
+ * OF THESE LICENSED DELIVERABLES.
+
+ *
+
+ * U.S. Government End Users.  These Licensed Deliverables are a
+
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+
+ * 1995), consisting of "commercial computer software" and "commercial
+
+ * computer software documentation" as such terms are used in 48
+
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+
+ * U.S. Government End Users acquire the Licensed Deliverables with
+
+ * only those rights set forth herein.
+
+ *
+
+ * Any use of the Licensed Deliverables in individual and commercial
+
+ * software must include, in the user documentation and internal
+
+ * comments to the code, the above Disclaimer and U.S. Government End
+
+ * Users Notice.
+
+ */
+
+
+
+#if !defined(__SM_35_INTRINSICS_H__)
+#define __SM_35_INTRINSICS_H__
+
+
+
+
+#endif /* !__SM_35_INTRINSICS_H__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d6ac004cd92d3af9281143123289bc2353dd494
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.hpp
@@ -0,0 +1,742 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_60_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+extern "C"
+{
+extern __device__ __device_builtin__ double __dAtomicAdd(double *address, double val);
+
+extern __device__ __device_builtin__
+int __iAtomicAdd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAdd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_system(float *address, float val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_block(double *address, double val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_system(double *address, double val);
+
+extern __device__ __device_builtin__
+int __iAtomicExch_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicExch_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_system(float *address, float val);
+
+extern __device__ __device_builtin__
+int __iAtomicMin_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMin_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicMax_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMax_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_block(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_system(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_block(unsigned int *address, unsigned int compare,
+                                unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_system(unsigned int *address, unsigned int compare,
+                                 unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_block(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_system(unsigned long long int *address,
+                                         unsigned long long int compare,
+                                         unsigned long long int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_system(unsigned long long *address, unsigned long long val);
+}
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val)
+{
+  return __dAtomicAdd(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val)
+{
+  return __fAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val)
+{
+  return __fAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val)
+{
+  return __dAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val)
+{
+  return __dAtomicAdd_system(address, val);
+}
+
+/* Subtracts val from *address by atomically adding its negation through __iAtomicAdd_block; returns that builtin's result. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__ int atomicSub_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, (int)(0U - (unsigned int)val)); /* negate in unsigned arithmetic: -val is signed overflow (UB) when val == INT_MIN */
+}
+
+/* Subtracts val from *address by atomically adding its negation through __iAtomicAdd_system; returns that builtin's result. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__ int atomicSub_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, (int)(0U - (unsigned int)val)); /* negate in unsigned arithmetic: -val is signed overflow (UB) when val == INT_MIN */
+}
+
+/* Subtracts val from *address by atomically adding its two's-complement negation through __uAtomicAdd_block; returns that builtin's result. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, 0U - val); /* modular unsigned negate; avoids the int round-trip, which is UB for val == 0x80000000 */
+}
+
+/* Subtracts val from *address by atomically adding its two's-complement negation through __uAtomicAdd_system; returns that builtin's result. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, 0U - val); /* modular unsigned negate; avoids the int round-trip, which is UB for val == 0x80000000 */
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_block(int *address, int val)
+{
+  return __iAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val)
+{
+  return __iAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val)
+{
+  return __fAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val)
+{
+  return __fAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_block(int *address, int val)
+{
+  return __iAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val)
+{
+  return __iAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val)
+{
+  return __illAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val)
+{
+  return __illAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_block(int *address, int val)
+{
+  return __iAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val)
+{
+  return __iAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val)
+{
+  return __illAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val)
+{
+  return __illAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_block(int *address, int compare, int val)
+{
+  return __iAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val)
+{
+  return __iAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val)
+{
+  return __uAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val)
+{
+  return __uAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val)
+{
+  return __ullAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val)
+{
+  return __ullAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_block(int *address, int val)
+{
+  return __iAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val)
+{
+  return __iAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val)
+{
+  return __llAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val)
+{
+  return __llAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_block(int *address, int val)
+{
+  return __iAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val)
+{
+  return __iAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val)
+{
+  return __llAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val)
+{
+  return __llAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_block(int *address, int val)
+{
+  return __iAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val)
+{
+  return __iAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val)
+{
+  return __llAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val)
+{
+  return __llAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_system(address, val);
+}
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..40dbe98ac42483b53f96d27280e621608ca24094
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.h
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2016-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+
+#if !defined(__SM_61_INTRINSICS_H__)
+#define __SM_61_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ && !_NVHPC_CUDA */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ || _NVHPC_CUDA */
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-6.1 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+
+/******************************************************************************
+ *                                   __dp2a                                   *
+ ******************************************************************************/
+// Generic [_lo]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the lower half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the lower half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_lo]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the lower half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the lower half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the lower 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+// Generic [_hi]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the upper half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the upper half of the second input.
+ *
+ * \details Extracts two packed 16-bit integers from \p srcA
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_hi]
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p signed \p int16 by \p int8 dot product with \p int32 accumulate,
+ * taking the upper half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Two-way \p unsigned \p int16 by \p int8 dot product with
+ * \p unsigned \p int32 accumulate, taking the upper half of the second input.
+ *
+ * \details Takes two packed 16-bit integers from \p srcA vector
+ * and two packed 8-bit integers from the upper 16 bits of \p srcB vector,
+ * then creates two pairwise 8x16 products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+
+
+/******************************************************************************
+ *                                   __dp4a                                   *
+ ******************************************************************************/
+// Generic
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p signed \p int8 dot product with \p int32 accumulate.
+ *
+ * \details Extracts four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB, then creates four pairwise products and adds them together
+ * to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p unsigned \p int8 dot product with \p unsigned \p int32 accumulate.
+ *
+ * \details Extracts four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB, then creates four pairwise products and adds them together
+ * to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p signed \p int8 dot product with \p int32 accumulate.
+ *
+ * \details Takes four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB vectors, then creates four pairwise products and adds them
+ * together to a signed 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Four-way \p unsigned \p int8 dot product with \p unsigned \p int32 accumulate.
+ *
+ * \details Takes four pairs of packed byte-sized integers from \p srcA
+ * and \p srcB vectors, then creates four pairwise products and adds them
+ * together to an unsigned 32-bit integer \p c.
+ */
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_61_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_61_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__ && __CUDA_ARCH__ */
+
+#endif /* !__SM_61_INTRINSICS_H__ */
+#undef EXCLUDE_FROM_RTC
\ No newline at end of file
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fb940c1d2bd5ee7b4a5020e12297bc2927e0386
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/surface_functions.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_FUNCTIONS_H__)
+#define __SURFACE_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_surface_types.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif
+
+template <typename T> struct __nv_surf_trait {  typedef void * cast_type; };  /* Primary template: any T without an explicit specialization below gets cast_type = void *. */
+
+template<> struct __nv_surf_trait<char> {  typedef char * cast_type; };
+template<> struct __nv_surf_trait<signed char> {  typedef signed char * cast_type; };
+template<> struct __nv_surf_trait<unsigned char> {  typedef unsigned char * cast_type; };
+template<> struct __nv_surf_trait<char1> {  typedef char1 * cast_type; };
+template<> struct __nv_surf_trait<uchar1> {  typedef uchar1 * cast_type; };
+template<> struct __nv_surf_trait<char2> {  typedef char2 * cast_type; };
+template<> struct __nv_surf_trait<uchar2> {  typedef uchar2 * cast_type; };
+template<> struct __nv_surf_trait<char4> {  typedef char4 * cast_type; };
+template<> struct __nv_surf_trait<uchar4> {  typedef uchar4 * cast_type; };
+template<> struct __nv_surf_trait<short> {  typedef short * cast_type; };
+template<> struct __nv_surf_trait<unsigned short> {  typedef unsigned short * cast_type; };
+template<> struct __nv_surf_trait<short1> {  typedef short1 * cast_type; };
+template<> struct __nv_surf_trait<ushort1> {  typedef ushort1 * cast_type; };
+template<> struct __nv_surf_trait<short2> {  typedef short2 * cast_type; };
+template<> struct __nv_surf_trait<ushort2> {  typedef ushort2 * cast_type; };
+template<> struct __nv_surf_trait<short4> {  typedef short4 * cast_type; };
+template<> struct __nv_surf_trait<ushort4> {  typedef ushort4 * cast_type; };
+template<> struct __nv_surf_trait<int> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned int> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<int1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<uint1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<int2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<uint2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<int4> {  typedef int4 * cast_type; };
+template<> struct __nv_surf_trait<uint4> {  typedef uint4 * cast_type; };
+template<> struct __nv_surf_trait<long long> {  typedef long long * cast_type; };
+template<> struct __nv_surf_trait<unsigned long long> {  typedef unsigned long long * cast_type; };
+template<> struct __nv_surf_trait<longlong1> {  typedef longlong1 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong1> {  typedef ulonglong1 * cast_type; };
+template<> struct __nv_surf_trait<longlong2> {  typedef longlong2 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong2> {  typedef ulonglong2 * cast_type; };
+#if !defined(__LP64__)  /* non-LP64 targets: each long-based type maps to the int-based pointer of the same signedness and vector width */
+template<> struct __nv_surf_trait<long> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned long> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<long1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<ulong1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<long2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<ulong2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<long4> {  typedef int4 * cast_type; };  /* signed -> signed (was uint4 *, swapped with ulong4) */
+template<> struct __nv_surf_trait<ulong4> {  typedef uint4 * cast_type; };  /* unsigned -> unsigned (was int4 *, swapped with long4) */
+#endif /* !__LP64__ */
+template<> struct __nv_surf_trait<float> {  typedef float * cast_type; };
+template<> struct __nv_surf_trait<float1> {  typedef float1 * cast_type; };
+template<> struct __nv_surf_trait<float2> {  typedef float2 * cast_type; };
+template<> struct __nv_surf_trait<float4> {  typedef float4 * cast_type; };
+
+
+#undef __DEPRECATED__
+
+
+#endif /* __cplusplus && __CUDACC__ */
+#endif /* !__SURFACE_FUNCTIONS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e5537d87294ee78ecec567893a6aaec333db317
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_indirect_functions.h
@@ -0,0 +1,638 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+#ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__
+#define __TEXTURE_INDIRECT_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+
+#include "cuda_runtime_api.h"
+
+
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
+#define __NV_TEX_SPARSE 1
+#endif  /* endif */
+
+template <typename T> struct __nv_itex_trait {   };  /* Primary template has no nested `type`; only the explicit specializations below define `type` (always void), so `typename __nv_itex_trait<T>::type` names a type only for those T. */
+template<> struct __nv_itex_trait<char> { typedef void type; };
+template<> struct __nv_itex_trait<signed char> { typedef void type; };
+template<> struct __nv_itex_trait<char1> { typedef void type; };
+template<> struct __nv_itex_trait<char2> { typedef void type; };
+template<> struct __nv_itex_trait<char4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned char> { typedef void type; };
+template<> struct __nv_itex_trait<uchar1> { typedef void type; };
+template<> struct __nv_itex_trait<uchar2> { typedef void type; };
+template<> struct __nv_itex_trait<uchar4> { typedef void type; };
+template<> struct __nv_itex_trait<short> { typedef void type; };
+template<> struct __nv_itex_trait<short1> { typedef void type; };
+template<> struct __nv_itex_trait<short2> { typedef void type; };
+template<> struct __nv_itex_trait<short4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned short> { typedef void type; };
+template<> struct __nv_itex_trait<ushort1> { typedef void type; };
+template<> struct __nv_itex_trait<ushort2> { typedef void type; };
+template<> struct __nv_itex_trait<ushort4> { typedef void type; };
+template<> struct __nv_itex_trait<int> { typedef void type; };
+template<> struct __nv_itex_trait<int1> { typedef void type; };
+template<> struct __nv_itex_trait<int2> { typedef void type; };
+template<> struct __nv_itex_trait<int4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned int> { typedef void type; };
+template<> struct __nv_itex_trait<uint1> { typedef void type; };
+template<> struct __nv_itex_trait<uint2> { typedef void type; };
+template<> struct __nv_itex_trait<uint4> { typedef void type; };
+#if !defined(__LP64__)
+template<> struct __nv_itex_trait<long> { typedef void type; };
+template<> struct __nv_itex_trait<long1> { typedef void type; };
+template<> struct __nv_itex_trait<long2> { typedef void type; };
+template<> struct __nv_itex_trait<long4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned long> { typedef void type; };
+template<> struct __nv_itex_trait<ulong1> { typedef void type; };
+template<> struct __nv_itex_trait<ulong2> { typedef void type; };
+template<> struct __nv_itex_trait<ulong4> { typedef void type; };
+#endif /* !__LP64__ */
+template<> struct __nv_itex_trait<float> { typedef void type; };
+template<> struct __nv_itex_trait<float1> { typedef void type; };
+template<> struct __nv_itex_trait<float2> { typedef void type; };
+template<> struct __nv_itex_trait<float4> { typedef void type; };
+
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj and x to __nv_tex_surf_handler under the "__itex1Dfetch" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x)
+{
+   __nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex1Dfetch overload and returns it. */
+static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x)
+{
+  T result;
+  tex1Dfetch(&result, texObject, x);
+  return result;
+}
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj and x to __nv_tex_surf_handler under the "__itex1D" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x)
+{
+   __nv_tex_surf_handler("__itex1D", ptr, obj, x);
+}
+
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex1D overload and returns it. */
+static __device__ T tex1D(cudaTextureObject_t texObject, float x)
+{
+  T result;
+  tex1D(&result, texObject, x);
+  return result;
+}
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x and y to __nv_tex_surf_handler under the "__itex2D" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y)
+{
+   __nv_tex_surf_handler("__itex2D", ptr, obj, x, y);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex2D overload and returns it. */
+static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
+{
+  T result;
+  tex2D(&result, texObject, x, y);
+  return result;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-out overload: calls the "__itex2D_sparse" handler, then stores (flag != 0) into *isResident. */
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y,
+                                                          bool* isResident)
+{
+  unsigned char residentFlag;  /* passed by address to the handler */
+  __nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &residentFlag);
+  *isResident = (residentFlag != 0);
+}
+
+template <typename T>  /* Sparse value-returning overload: delegates to the pointer-out sparse tex2D overload, which also writes *isResident. */
+static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y, bool* isResident)
+{
+  T result;
+  tex2D(&result, texObject, x, y, isResident);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x, y and z to __nv_tex_surf_handler under the "__itex3D" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{
+   __nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex3D overload and returns it. */
+static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z)
+{
+  T result;
+  tex3D(&result, texObject, x, y, z);
+  return result;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-out overload: calls the "__itex3D_sparse" handler, then stores (flag != 0) into *isResident. */
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z,
+                                                          bool* isResident)
+{
+  unsigned char residentFlag;  /* passed by address to the handler */
+  __nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &residentFlag);
+  *isResident = (residentFlag != 0);
+}
+
+template <typename T>  /* Sparse value-returning overload: delegates to the pointer-out sparse tex3D overload, which also writes *isResident. */
+static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool* isResident)
+{
+  T result;
+  tex3D(&result, texObject, x, y, z, isResident);
+  return result;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x and layer to __nv_tex_surf_handler under the "__itex1DLayered" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer)
+{
+   __nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex1DLayered overload and returns it. */
+static __device__ T tex1DLayered(cudaTextureObject_t texObject, float x, int layer)
+{
+  T result;
+  tex1DLayered(&result, texObject, x, layer);
+  return result;
+}
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x, y and layer to __nv_tex_surf_handler under the "__itex2DLayered" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer)
+{
+  __nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex2DLayered overload and returns it. */
+static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer)
+{
+  T result;
+  tex2DLayered(&result, texObject, x, y, layer);
+  return result;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-out overload: calls the "__itex2DLayered_sparse" handler, then stores (flag != 0) into *isResident. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool* isResident)
+{
+  unsigned char residentFlag;  /* passed by address to the handler */
+  __nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &residentFlag);
+  *isResident = (residentFlag != 0);
+}
+
+template <typename T>  /* Sparse value-returning overload: delegates to the pointer-out sparse tex2DLayered overload, which also writes *isResident. */
+static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool* isResident)
+{
+  T result;
+  tex2DLayered(&result, texObject, x, y, layer, isResident);
+  return result;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x, y and z to __nv_tex_surf_handler under the "__itexCubemap" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{
+  __nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z);
+}
+
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out texCubemap overload and returns it. */
+static __device__ T texCubemap(cudaTextureObject_t texObject, float x, float y, float z)
+{
+  T result;
+  texCubemap(&result, texObject, x, y, z);
+  return result;
+}
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x, y, z and layer to __nv_tex_surf_handler under the "__itexCubemapLayered" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer)
+{ 
+  __nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out texCubemapLayered overload and returns it. */
+static __device__ T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer)
+{
+  T result;
+  texCubemapLayered(&result, texObject, x, y, z, layer);
+  return result;
+}
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x, y and comp (default 0) to __nv_tex_surf_handler under the "__itex2Dgather" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0)
+{
+  __nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex2Dgather overload and returns it. */
+static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0)
+{
+  T result;
+  tex2Dgather(&result, to, x, y, comp);
+  return result;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  /* Sparse pointer-out overload: calls the "__itex2Dgather_sparse" handler (comp precedes the flag pointer), then stores (flag != 0) into *isResident. */
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool* isResident, int comp = 0)
+{
+  unsigned char residentFlag;  /* passed by address to the handler */
+  __nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp, &residentFlag);
+  *isResident = (residentFlag != 0);
+}
+
+template <typename T>  /* Sparse value-returning overload: delegates to the pointer-out sparse tex2Dgather overload, which also writes *isResident. */
+static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, bool* isResident, int comp = 0)
+{
+  T result;
+  tex2Dgather(&result, to, x, y, isResident, comp);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x and level to __nv_tex_surf_handler under the "__itex1DLod" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level)
+{
+  __nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex1DLod overload and returns it. */
+static __device__ T tex1DLod(cudaTextureObject_t texObject, float x, float level)
+{
+  T result;
+  tex1DLod(&result, texObject, x, level);
+  return result;
+}
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x, y and level to __nv_tex_surf_handler under the "__itex2DLod" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level)
+{
+  __nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level);
+}
+
+template <typename T>  /* Value-returning overload: fills a local through the pointer-out tex2DLod overload and returns it. */
+static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level)
+{
+  T result;
+  tex2DLod(&result, texObject, x, y, level);
+  return result;
+}
+
+#if __NV_TEX_SPARSE
+
+template <typename T>  /* Sparse pointer-out overload: calls the "__itex2DLod_sparse" handler, then stores (flag != 0) into *isResident. */
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool* isResident)
+{
+  unsigned char residentFlag;  /* passed by address to the handler */
+  __nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &residentFlag);
+  *isResident = (residentFlag != 0);
+}
+
+template <typename T>  /* Sparse value-returning overload: delegates to the pointer-out sparse tex2DLod overload, which also writes *isResident. */
+static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool* isResident)
+{
+  T result;
+  tex2DLod(&result, texObject, x, y, level, isResident);
+  return result;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  /* Pointer-out overload: hands ptr, obj, x, y, z and level to __nv_tex_surf_handler under the "__itex3DLod" tag. Viable only for T for which __nv_itex_trait<T>::type exists. */
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{ 
+  __nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level);
+}
+
+template <class T>
+static __device__  T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+  T ret;
+  tex3DLod(&ret, texObject, x, y, z, level);
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  // sparse variant (inside the __NV_TEX_SPARSE region): adds the isResident out-parameter
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool* isResident)
+{ 
+  unsigned char res;  // status byte; handed to the handler by address
+  __nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &res);  // same arguments as the non-sparse call plus &res
+  *isResident = (res != 0);  // non-zero status is reported as true; isResident must be non-null
+}
+
+template <class T>  // by-value form of the sparse variant
+static __device__  T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool* isResident)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex3DLod(&ret, texObject, x, y, z, level, isResident);  // the pointer overload also writes *isResident
+  return ret;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level)
+{ 
+  __nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level);  // the integer layer is passed between the coordinate and the level
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex1DLayeredLod(&ret, texObject, x, layer, level);  // delegate to the T *ptr overload
+  return ret;
+}
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level)
+{ 
+  __nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level);  // argument order: coordinates, then layer, then level
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex2DLayeredLod(&ret, texObject, x, y, layer, level);  // delegate to the T *ptr overload
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  // sparse variant (inside the __NV_TEX_SPARSE region): adds the isResident out-parameter
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool* isResident)
+{ 
+  unsigned char res;  // status byte; handed to the handler by address
+  __nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &res);  // non-sparse argument list plus &res
+  *isResident = (res != 0);  // non-zero status is reported as true; isResident must be non-null
+}
+
+template <class T>  // by-value form of the sparse variant
+static __device__  T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool* isResident)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex2DLayeredLod(&ret, texObject, x, y, layer, level, isResident);  // the pointer overload also writes *isResident
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{ 
+  __nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level);  // three float coordinates plus level go to the "__itexCubemapLod" handler
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+  T ret;  // no initializer; its address is passed on below
+  texCubemapLod(&ret, texObject, x, y, z, level);  // delegate to the T *ptr overload
+  return ret;
+}
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+  __nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);  // the "_v2" handler receives the addresses of the by-value float4 gradient copies
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  T ret;  // no initializer; its address is passed on below
+  texCubemapGrad(&ret, texObject, x, y, z, dPdx, dPdy);  // gradients are passed by value again to the T *ptr overload
+  return ret;
+}
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level)
+{ 
+  __nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level);  // argument order: three coordinates, layer, level
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level)
+{
+  T ret;  // no initializer; its address is passed on below
+  texCubemapLayeredLod(&ret, texObject, x, y, z, layer, level);  // delegate to the T *ptr overload
+  return ret;
+}
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy)
+{
+  __nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy);  // scalar gradients go by value here (the 2D/3D/cubemap forms pass vector gradients by address)
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex1DGrad(&ret, texObject, x, dPdx, dPdy);  // delegate to the T *ptr overload
+  return ret;
+}
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy)
+{ 
+  __nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy);  // the "_v2" handler receives the addresses of the by-value float2 gradient copies
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex2DGrad(&ret, texObject, x, y, dPdx, dPdy);  // delegate to the T *ptr overload
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  // sparse variant (inside the __NV_TEX_SPARSE region): adds the isResident out-parameter
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
+{ 
+  unsigned char res;  // status byte; handed to the handler by address
+  __nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &res);  // gradients by address as in the non-sparse form, plus &res
+  *isResident = (res != 0);  // non-zero status is reported as true; isResident must be non-null
+}
+
+template <class T>  // by-value form of the sparse variant
+static __device__  T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex2DGrad(&ret, texObject, x, y, dPdx, dPdy, isResident);  // the pointer overload also writes *isResident
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+  __nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);  // gradients are float4 (not float3) and are passed by address
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy);  // delegate to the T *ptr overload
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  // sparse variant (inside the __NV_TEX_SPARSE region): adds the isResident out-parameter
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
+{ 
+  unsigned char res;  // status byte; handed to the handler by address
+  __nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &res);  // non-sparse argument list plus &res
+  *isResident = (res != 0);  // non-zero status is reported as true; isResident must be non-null
+}
+
+template <class T>  // by-value form of the sparse variant
+static __device__  T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy, isResident);  // the pointer overload also writes *isResident
+  return ret;
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy)
+{ 
+  __nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy);  // layer follows the coordinate; scalar gradients are passed by value
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex1DLayeredGrad(&ret, texObject, x, layer, dPdx, dPdy);  // delegate to the T *ptr overload
+  return ret;
+}
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{ 
+  __nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy);  // layer by value, float2 gradients by address
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy);  // delegate to the T *ptr overload
+  return ret;
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>  // sparse variant (inside the __NV_TEX_SPARSE region): adds the isResident out-parameter
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
+{ 
+  unsigned char res;  // status byte; handed to the handler by address
+  __nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &res);  // non-sparse argument list plus &res
+  *isResident = (res != 0);  // non-zero status is reported as true; isResident must be non-null
+}
+
+template <class T>  // by-value form of the sparse variant
+static __device__  T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
+{
+  T ret;  // no initializer; its address is passed on below
+  tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy, isResident);  // the pointer overload also writes *isResident
+  return ret;
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>  // pointer-output form; return type is __nv_itex_trait<T>::type (trait defined outside this excerpt)
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+  __nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy);  // three coordinates, layer by value, float4 gradients by address
+}
+
+template <class T>  // by-value form built on the pointer overload
+static __device__  T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+  T ret;  // no initializer; its address is passed on below
+  texCubemapLayeredGrad(&ret, texObject, x, y, z, layer, dPdx, dPdy);  // delegate to the T *ptr overload
+  return ret;
+}
+
+#undef __NV_TEX_SPARSE
+
+#endif // __cplusplus && __CUDACC__
+#endif // __TEXTURE_INDIRECT_FUNCTIONS_H__
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3e3e90ef5c02d7f3e62178792fd10f93ba4d85f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/texture_types.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__TEXTURE_TYPES_H__)
+#define __TEXTURE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+#ifndef __CUDACC_RTC_MINIMAL__
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#define cudaTextureType1D              0x01  /* non-layered type codes: 0x01, 0x02, 0x03, 0x0C */
+#define cudaTextureType2D              0x02
+#define cudaTextureType3D              0x03
+#define cudaTextureTypeCubemap         0x0C
+#define cudaTextureType1DLayered       0xF1  /* layered codes keep the base code's low nibble and set the high nibble to 0xF */
+#define cudaTextureType2DLayered       0xF2
+#define cudaTextureTypeCubemapLayered  0xFC  /* no layered counterpart of the 3D code is defined in this list */
+
+/**
+ * CUDA texture address modes (stored per dimension in cudaTextureDesc::addressMode)
+ */
+enum __device_builtin__ cudaTextureAddressMode
+{
+    cudaAddressModeWrap   = 0,    /**< Wrapping address mode */
+    cudaAddressModeClamp  = 1,    /**< Clamp to edge address mode */
+    cudaAddressModeMirror = 2,    /**< Mirror address mode */
+    cudaAddressModeBorder = 3     /**< Border address mode */
+};
+
+/**
+ * CUDA texture filter modes (type of both cudaTextureDesc::filterMode and ::mipmapFilterMode)
+ */
+enum __device_builtin__ cudaTextureFilterMode
+{
+    cudaFilterModePoint  = 0,     /**< Point filter mode */
+    cudaFilterModeLinear = 1      /**< Linear filter mode */
+};
+
+/**
+ * CUDA texture read modes (type of cudaTextureDesc::readMode)
+ */
+enum __device_builtin__ cudaTextureReadMode
+{
+    cudaReadModeElementType     = 0,  /**< Read texture as specified element type */
+    cudaReadModeNormalizedFloat = 1   /**< Read texture as normalized float */
+};
+
+/**
+ * CUDA texture descriptor
+ */
+struct __device_builtin__ cudaTextureDesc
+{
+    /**
+     * Texture address mode for up to 3 dimensions (one array entry per dimension)
+     */
+    enum cudaTextureAddressMode addressMode[3];
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode  filterMode;
+    /**
+     * Texture read mode
+     */
+    enum cudaTextureReadMode    readMode;
+    /**
+     * Perform sRGB->linear conversion during texture read (int used as a flag)
+     */
+    int                         sRGB;
+    /**
+     * Texture border color (four float components)
+     */
+    float                       borderColor[4];
+    /**
+     * Indicates whether texture coordinates are normalized or not (int used as a flag)
+     */
+    int                         normalizedCoords;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode  mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                       mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                       minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                       maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations (int used as a flag).
+     */
+    int                         disableTrilinearOptimization;
+    /**
+     * Enable seamless cube map filtering (int used as a flag).
+     */
+    int                         seamlessCubemap;
+};
+
+/**
+ * An opaque value that represents a CUDA texture object (an unsigned long long handle)
+ */
+typedef __device_builtin__ unsigned long long cudaTextureObject_t;
+
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#endif /* !__TEXTURE_TYPES_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..38a52e3936ca88f6f5437fe8731b6b1cfdc7ca02
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_functions.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_H__)
+#define __VECTOR_FUNCTIONS_H__
+
+/* NOTE: For NVRTC, these declarations have been moved into the compiler 
+   (to reduce compile time) */
+#define EXCLUDE_FROM_RTC
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDACC_RTC__)  /* NVRTC: plain host+device declarations, no static/inline */
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__  /* static inline; bodies presumably come from vector_functions.hpp, included at the end of this header */
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);  /* prototypes only; char vectors take explicitly signed char components */
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);  /* the u-prefixed variants take unsigned char */
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);  /* one parameter per component, in x, y, z, w order */
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);  /* prototypes only; short / unsigned short families, 1 to 4 components */
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);  /* one parameter per component, in x, y, z, w order */
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);  /* prototypes only; int / unsigned int families, 1 to 4 components */
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);  /* one parameter per component, in x, y, z, w order */
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);  /* prototypes only; components are long int, whose width depends on the platform ABI */
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);  /* one parameter per component, in x, y, z, w order */
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);  /* prototypes only; float family, 1 to 4 components */
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);  /* one parameter per component, in x, y, z, w order */
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);  /* prototypes only; long long / unsigned long long families, 1 to 4 components */
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);  /* one parameter per component, in x, y, z, w order */
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);  /* prototypes only; double family, 1 to 4 components */
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);  /* one parameter per component, in x, y, z, w order */
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__)
+#include "vector_functions.hpp"
+#endif /* !__CUDACC_RTC__ */
+
+#undef EXCLUDE_FROM_RTC
+
+#endif /* !__VECTOR_FUNCTIONS_H__ */
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a755e65f36b56644cd25d08603e10a6efc3fb8b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/vector_types.h
@@ -0,0 +1,449 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_TYPES_H__)
+#define __VECTOR_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+
+/* NVRTC compiler defines these instead of in the header (to reduce compile time)
+*/
+#ifndef __CUDACC_RTC_BUILTIN_VECTOR_TYPES__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
+    defined(_WIN32) && !defined(_WIN64)  /* host-only compile on 32-bit Windows (neither nvcc nor NVRTC) */
+
+#pragma warning(push)
+#pragma warning(disable: 4201 4408)
+
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ tag                      \
+{                                                  \
+    union                                          \
+    {                                              \
+        struct { members };                        \
+        struct { long long int :1,:0; };           \
+    };                                             \
+}  /* members share an anonymous union with unnamed long long bit-fields; presumably this yields 8-byte alignment without an alignment attribute -- TODO confirm */
+
+#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
+
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ __align__(8) tag         \
+{                                                  \
+    members                                        \
+}  /* all other configurations: an ordinary struct declared with __align__(8) */
+
+#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
+
+struct __device_builtin__ char1  /* single signed char component; no explicit alignment attribute */
+{
+    signed char x;
+};
+
+struct __device_builtin__ uchar1  /* unsigned counterpart of char1 */
+{
+    unsigned char x;
+};
+
+
+struct __device_builtin__ __align__(2) char2  /* declared with __align__(2); the macro is defined outside this excerpt */
+{
+    signed char x, y;
+};
+
+struct __device_builtin__ __align__(2) uchar2  /* unsigned counterpart of char2 */
+{
+    unsigned char x, y;
+};
+
+struct __device_builtin__ char3  /* three components; no explicit alignment attribute, unlike char2 and char4 */
+{
+    signed char x, y, z;
+};
+
+struct __device_builtin__ uchar3  /* unsigned counterpart of char3 */
+{
+    unsigned char x, y, z;
+};
+
+struct __device_builtin__ __align__(4) char4  /* declared with __align__(4) */
+{
+    signed char x, y, z, w;
+};
+
+struct __device_builtin__ __align__(4) uchar4  /* unsigned counterpart of char4 */
+{
+    unsigned char x, y, z, w;
+};
+
+struct __device_builtin__ short1  /* single short component; no explicit alignment attribute */
+{
+    short x;
+};
+
+struct __device_builtin__ ushort1  /* unsigned counterpart of short1 */
+{
+    unsigned short x;
+};
+
+struct __device_builtin__ __align__(4) short2  /* declared with __align__(4) */
+{
+    short x, y;
+};
+
+struct __device_builtin__ __align__(4) ushort2  /* unsigned counterpart of short2 */
+{
+    unsigned short x, y;
+};
+
+struct __device_builtin__ short3  /* three components; no explicit alignment attribute */
+{
+    short x, y, z;
+};
+
+struct __device_builtin__ ushort3  /* unsigned counterpart of short3 */
+{
+    unsigned short x, y, z;
+};
+
+__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);  /* generated by the align8 macro; members are written as separate declarations, presumably so no comma splits the macro argument */
+__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);  /* unsigned counterpart of short4 */
+
+struct __device_builtin__ int1  /* single int component; no explicit alignment attribute */
+{
+    int x;
+};
+
+struct __device_builtin__ uint1  /* unsigned counterpart of int1 */
+{
+    unsigned int x;
+};
+
+__cuda_builtin_vector_align8(int2, int x; int y;);  /* generated by the align8 macro defined earlier in this header */
+__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);  /* unsigned counterpart of int2 */
+
+struct __device_builtin__ int3  /* three components; no explicit alignment attribute */
+{
+    int x, y, z;
+};
+
+struct __device_builtin__ uint3  /* unsigned counterpart of int3 */
+{
+    unsigned int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) int4  /* uses __builtin_align__(16) rather than __align__; both macros are defined outside this excerpt */
+{
+    int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) uint4  /* unsigned counterpart of int4 */
+{
+    unsigned int x, y, z, w;
+};
+
+struct __device_builtin__ long1
+{
+    long int x;
+};
+
+struct __device_builtin__ ulong1
+{
+    unsigned long x;
+};
+
+#if defined(_WIN32) /* long2/ulong2 are declared differently on Windows; presumably because long is 4 bytes there -- TODO confirm. */
+__cuda_builtin_vector_align8(long2, long int x; long int y;);
+__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
+#else /* !_WIN32 */
+/* Elsewhere the alignment is computed as twice the size of one member, i.e. the combined size of x and y. */
+struct __device_builtin__ __align__(2*sizeof(long int)) long2
+{
+    long int x, y; /* first and second component */
+};
+
+struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
+{
+    unsigned long int x, y; /* first and second component */
+};
+
+#endif /* _WIN32 */
+
+struct __device_builtin__ long3 /* Three-component long int vector; no explicit alignment attribute. */
+{
+    long int x, y, z; /* components in declaration order */
+};
+
+struct __device_builtin__ ulong3 /* Three-component unsigned long int vector; no explicit alignment attribute. */
+{
+    unsigned long int x, y, z; /* components in declaration order */
+};
+
+struct __device_builtin__ __builtin_align__(16) long4 /* Four-component long int vector; the alignment argument is a fixed 16 and does not depend on sizeof(long int), unlike long2. */
+{
+    long int x, y, z, w; /* components in declaration order */
+};
+
+struct __device_builtin__ __builtin_align__(16) ulong4 /* Four-component unsigned long int vector, declared with a fixed __builtin_align__(16) like long4. */
+{
+    unsigned long int x, y, z, w; /* components in declaration order */
+};
+
+struct __device_builtin__ float1 /* Single-component float vector; no explicit alignment attribute. */
+{
+    float x; /* the only component */
+};
+
+#if !defined(__CUDACC__) && defined(__arm__) && \
+    defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
+/* Special case: __CUDACC__ is not defined and the compiler is GCC 4.6 targeting ARM with __ARM_PCS_VFP set. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-pedantic"
+/* float2 gets a trailing zero-length array (a GNU extension, hence the suppressed -pedantic diagnostics); judging by the member's name it works around a compiler ICE -- TODO confirm. */
+struct __device_builtin__ __attribute__((aligned(8))) float2
+{
+    float x; float y; float __cuda_gnu_arm_ice_workaround[0];
+};
+/* Poison the dummy member's name so that any later use of the identifier is rejected by the compiler. */
+#pragma GCC poison __cuda_gnu_arm_ice_workaround
+#pragma GCC diagnostic pop
+
+#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
+         __GNUC__ == 4 && __GNUC_MINOR__ == 6 */
+/* Every other toolchain: two float components declared through the shared helper macro (defined outside this chunk). */
+__cuda_builtin_vector_align8(float2, float x; float y;);
+
+#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
+          __GNUC__ == 4 && __GNUC_MINOR__ == 6 */
+
+struct __device_builtin__ float3 /* Three-component float vector; no explicit alignment attribute. */
+{
+    float x, y, z; /* components in declaration order */
+};
+
+struct __device_builtin__ __builtin_align__(16) float4 /* Four-component float vector, declared with __builtin_align__(16) (macro defined outside this chunk; presumably 16-byte alignment). */
+{
+    float x, y, z, w; /* components in declaration order */
+};
+
+struct __device_builtin__ longlong1 /* Single-component long long int vector; no explicit alignment attribute. */
+{
+    long long int x; /* the only component */
+};
+
+struct __device_builtin__ ulonglong1 /* Single-component unsigned long long int vector; no explicit alignment attribute. */
+{
+    unsigned long long int x; /* the only component */
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong2 /* Two-component long long int vector, declared with __builtin_align__(16). */
+{
+    long long int x, y; /* first and second component */
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong2 /* Two-component unsigned long long int vector, declared with __builtin_align__(16). */
+{
+    unsigned long long int x, y; /* first and second component */
+};
+
+struct __device_builtin__ longlong3 /* Three-component long long int vector; no explicit alignment attribute. */
+{
+    long long int x, y, z; /* components in declaration order */
+};
+
+struct __device_builtin__ ulonglong3 /* Three-component unsigned long long int vector; no explicit alignment attribute. */
+{
+    unsigned long long int x, y, z; /* components in declaration order */
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong4 /* Four-component long long int vector; the alignment argument stays at 16, the same value used for longlong2. */
+{
+    long long int x, y, z ,w; /* components in declaration order */
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong4 /* Four-component unsigned long long int vector, declared with __builtin_align__(16) like longlong4. */
+{
+    unsigned long long int x, y, z, w; /* components in declaration order */
+};
+
+struct __device_builtin__ double1 /* Single-component double vector; no explicit alignment attribute. */
+{
+    double x; /* the only component */
+};
+
+struct __device_builtin__ __builtin_align__(16) double2 /* Two-component double vector, declared with __builtin_align__(16). */
+{
+    double x, y; /* first and second component */
+};
+
+struct __device_builtin__ double3 /* Three-component double vector; no explicit alignment attribute. */
+{
+    double x, y, z; /* components in declaration order */
+};
+
+struct __device_builtin__ __builtin_align__(16) double4 /* Four-component double vector; note the alignment argument is 16, the same as double2, not 4*sizeof(double). */
+{
+    double x, y, z, w; /* components in declaration order */
+};
+
+#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
+/* Only when __CUDACC__ is undefined on 32-bit Windows: restore the saved warning state. The matching push is presumably earlier in the file, outside this chunk -- TODO confirm. */
+#pragma warning(pop)
+
+#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+typedef __device_builtin__ struct char1 char1; /* This run gives every vector struct a typedef of the same name, so the types can be written without the struct keyword when compiled as C. */
+typedef __device_builtin__ struct uchar1 uchar1;
+typedef __device_builtin__ struct char2 char2;
+typedef __device_builtin__ struct uchar2 uchar2;
+typedef __device_builtin__ struct char3 char3;
+typedef __device_builtin__ struct uchar3 uchar3;
+typedef __device_builtin__ struct char4 char4;
+typedef __device_builtin__ struct uchar4 uchar4;
+typedef __device_builtin__ struct short1 short1;
+typedef __device_builtin__ struct ushort1 ushort1;
+typedef __device_builtin__ struct short2 short2;
+typedef __device_builtin__ struct ushort2 ushort2;
+typedef __device_builtin__ struct short3 short3;
+typedef __device_builtin__ struct ushort3 ushort3;
+typedef __device_builtin__ struct short4 short4;
+typedef __device_builtin__ struct ushort4 ushort4;
+typedef __device_builtin__ struct int1 int1;
+typedef __device_builtin__ struct uint1 uint1;
+typedef __device_builtin__ struct int2 int2;
+typedef __device_builtin__ struct uint2 uint2;
+typedef __device_builtin__ struct int3 int3;
+typedef __device_builtin__ struct uint3 uint3;
+typedef __device_builtin__ struct int4 int4;
+typedef __device_builtin__ struct uint4 uint4;
+typedef __device_builtin__ struct long1 long1;
+typedef __device_builtin__ struct ulong1 ulong1;
+typedef __device_builtin__ struct long2 long2;
+typedef __device_builtin__ struct ulong2 ulong2;
+typedef __device_builtin__ struct long3 long3;
+typedef __device_builtin__ struct ulong3 ulong3;
+typedef __device_builtin__ struct long4 long4;
+typedef __device_builtin__ struct ulong4 ulong4;
+typedef __device_builtin__ struct float1 float1;
+typedef __device_builtin__ struct float2 float2;
+typedef __device_builtin__ struct float3 float3;
+typedef __device_builtin__ struct float4 float4;
+typedef __device_builtin__ struct longlong1 longlong1;
+typedef __device_builtin__ struct ulonglong1 ulonglong1;
+typedef __device_builtin__ struct longlong2 longlong2;
+typedef __device_builtin__ struct ulonglong2 ulonglong2;
+typedef __device_builtin__ struct longlong3 longlong3;
+typedef __device_builtin__ struct ulonglong3 ulonglong3;
+typedef __device_builtin__ struct longlong4 longlong4;
+typedef __device_builtin__ struct ulonglong4 ulonglong4;
+typedef __device_builtin__ struct double1 double1;
+typedef __device_builtin__ struct double2 double2;
+typedef __device_builtin__ struct double3 double3;
+typedef __device_builtin__ struct double4 double4;
+
+#undef  __cuda_builtin_vector_align8 /* The declaration helper is not used past this point, so it is removed rather than left visible to code that includes this header. */
+
+#endif /* !defined(__CUDACC_RTC_BUILTIN_VECTOR_TYPES__) */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+struct __device_builtin__ dim3 /* Three unsigned int extents x, y, z; in C++ it also gains constructors and conversions to and from uint3. */
+{
+    unsigned int x, y, z;
+#if defined(__cplusplus) /* the member functions below exist only in C++ builds; in C this is a plain struct of three fields */
+#if __cplusplus >= 201103L /* C++11 and later: constexpr-qualified variants */
+    __host__ __device__ constexpr dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} /* every omitted extent defaults to 1 */
+    __host__ __device__ constexpr dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} /* non-explicit, so a uint3 converts implicitly; copies x, y, z */
+    __host__ __device__ constexpr operator uint3(void) const { return uint3{x, y, z}; } /* implicit conversion back to uint3 via brace initialization */
+#else /* pre-C++11: the same three members without constexpr */
+    __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} /* every omitted extent defaults to 1 */
+    __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} /* non-explicit, so a uint3 converts implicitly */
+    __host__ __device__ operator uint3(void) const { uint3 t; t.x = x; t.y = y; t.z = z; return t; } /* member-wise assignment instead of brace initialization */
+#endif
+#endif /* __cplusplus */
+};
+
+typedef __device_builtin__ struct dim3 dim3; /* Lets dim3 be named without the struct keyword, matching the typedefs given to the vector types. */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__) /* Clean up both marker macros when the "undef" flag is set; presumably this header set them near its top (outside this chunk) -- TODO confirm. */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
+#endif
+
+#endif /* !__VECTOR_TYPES_H__ */